Example #1
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get("databases/snomed.db")

            # descriptions
        cls.sqlite_handle.create(
            "descriptions",
            """(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)""",
        )
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

        # relationships
        cls.sqlite_handle.create(
            "relationships",
            """(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)""",
        )
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)")
        cls.sqlite_handle.execute("CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)")
Example #2
0
	def setup_tables(cls):
		""" Sets up the SNOMED `descriptions` and `relationships` tables and
		their indexes on the class-level SQLite handle.
		
		The handle is opened on databases/snomed.db first if the class has none
		yet. The index statements use IF NOT EXISTS; whether table creation is
		also a no-op for existing tables depends on `SQLite.create`, which is
		not visible here (the original note says it is).
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get(os.path.join('databases', 'snomed.db'))
		db = cls.sqlite_handle
		
		# (table name, column definitions, index statements), in creation order
		layout = (
			('descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)''', (
				"CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)",
			)),
			('relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)''', (
				"CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)",
				"CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)",
				"CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)",
				"CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)",
			)),
		)
		for table, columns, index_statements in layout:
			db.create(table, columns)
			for statement in index_statements:
				db.execute(statement)
Example #3
0
	def setup_tables(cls):
		""" Sets up the `descriptions` and `relationships` tables.
		
		Opens the handle on `cls.database_path()` when the class has none yet,
		then passes each table's column definitions to the handle's `create`.
		The original note says nothing happens when the tables already exist;
		that is up to `SQLite.create`, which is not visible here.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get(cls.database_path())
		
		# (table name, column definitions), in creation order
		schemas = (
			('descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)'''),
			('relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)'''),
		)
		for table_name, columns in schemas:
			cls.sqlite_handle.create(table_name, columns)
Example #4
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		Does nothing if the tables/indexes already exist
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get(cls.database_path())

        # descriptions
        cls.sqlite_handle.create(
            'descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)''')

        # relationships
        cls.sqlite_handle.create(
            'relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)''')
Example #5
0
	def setup(cls):
		""" Makes sure we have a database """
		
		if cls.did_setup:
			return
		
		# init the database if needed
		if not os.path.exists(DB_FILE):
			
			# make sure the parent directory exists
			if len(os.path.dirname(DB_FILE)) > 0 \
				and not os.path.exists(os.path.dirname(DB_FILE)):
				try:
					os.makedirs(os.path.dirname(DB_FILE))
				except Exception, e:
					print "Failed to create %s: %s" % (os.path.dirname(DB_FILE), e)
					return
			
			# database init
			sql = SQLite.get(DB_FILE)
			sql.create('record_tokens', '''(
					token_id INTEGER PRIMARY KEY,
					record_id INT,
					on_server VARCHAR,
					cookie VARCHAR,
					token VARCHAR,
					secret VARCHAR,
					added TIMESTAMP,
					CONSTRAINT record_server UNIQUE (record_id, on_server) ON CONFLICT REPLACE
				)''')
			sql.execute("CREATE INDEX IF NOT EXISTS record_index ON record_tokens (record_id)")
			sql.execute("CREATE INDEX IF NOT EXISTS server_index ON record_tokens (on_server)")
			sql.execute("CREATE INDEX IF NOT EXISTS cookie_index ON record_tokens (cookie)")
			sql.execute("CREATE INDEX IF NOT EXISTS token_index ON record_tokens (token)")
Example #6
0
	def setup_tables(cls):
		""" Sets up the `snomed` table and then triggers the CSV import.
		
		Opens the handle on umls.db when the class has none yet, creates the
		table through the handle's `create`, and finally calls
		`cls.import_snomed_from_csv()`.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get('umls.db')
		
		snomed_columns = '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT
			)'''
		cls.sqlite_handle.create('snomed', snomed_columns)
		
		# the import is invoked on every call, after the table is in place
		cls.import_snomed_from_csv()
Example #7
0
    def __init__(self):
        super().__init__()
        from sqlite import SQLite

        absolute = os.path.dirname(os.path.realpath(__file__))
        db_file = os.environ.get("SQLITE_FILE")
        db_file = db_file if db_file else os.path.join(absolute, "databases/rxnorm.db")
        self.db_file = db_file
        self.handled = 0

        self.sqlite = SQLite.get(self.db_file)
        self.sqlite.execute("DROP TABLE IF EXISTS drug_cache")

        self.sqlite.execute(
            """CREATE TABLE drug_cache
						(rxcui varchar, property text, value text)"""
        )

        self.sqlite.execute("CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)")

        self.sqlite.execute("DROP VIEW IF EXISTS drug_treatments_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_treatments_by_ndc as
				select a.value as ndc, b.value as treatment_intent
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='treatment_intent'
				"""
        )

        self.sqlite.execute("DROP VIEW IF EXISTS drug_classes_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_classes_by_ndc as
				select a.value as ndc, b.value as drug_class
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='drug_class'
				"""
        )

        self.sqlite.execute("DROP VIEW IF EXISTS drug_ingredients_by_ndc")
        self.sqlite.execute(
            """CREATE VIEW drug_ingredients_by_ndc as
				select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				join RXNCONSO c on c.rxcui=b.value
				where a.property='ndc' and b.property='ingredient'
				and c.sab='RXNORM' and c.tty='IN'
                """
        )
Example #8
0
	def setup_tables(cls):
		""" Sets up the `studies` table, then lets StudyEligibility set up its
		own tables.
		
		Opens the handle on storage.db when the class has none yet.
		"""
		if cls.sqlite_handle is None:
			cls.sqlite_handle = SQLite.get('storage.db')
		
		studies_columns = '''(
			nct UNIQUE,
			updated TIMESTAMP,
			elig_gender INTEGER,
			elig_min_age INTEGER,
			elig_max_age INTEGER,
			elig_population TEXT,
			elig_sampling TEXT,
			elig_accept_healthy INTEGER DEFAULT 0,
			elig_criteria TEXT
		)'''
		cls.sqlite_handle.create('studies', studies_columns)
		
		# delegate the remaining setup
		StudyEligibility.setup_tables()
    def __init__(self):
        super().__init__()
        from sqlite import SQLite
        absolute = os.path.dirname(os.path.realpath(__file__))
        db_file = os.environ.get('SQLITE_FILE')
        db_file = db_file if db_file else os.path.join(absolute,
                                                       'databases/rxnorm.db')
        self.db_file = db_file
        self.handled = 0

        self.sqlite = SQLite.get(self.db_file)
        self.sqlite.execute('DROP TABLE IF EXISTS drug_cache')

        self.sqlite.execute('''CREATE TABLE drug_cache
						(rxcui varchar, property text, value text)''')

        self.sqlite.execute(
            'CREATE INDEX i_drug_cache ON drug_cache (rxcui, property)')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_treatments_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_treatments_by_ndc as
				select a.value as ndc, b.value as treatment_intent
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='treatment_intent'
				''')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_classes_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_classes_by_ndc as
				select a.value as ndc, b.value as drug_class
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				where a.property='ndc' and b.property='drug_class'
				''')

        self.sqlite.execute('DROP VIEW IF EXISTS drug_ingredients_by_ndc')
        self.sqlite.execute('''CREATE VIEW drug_ingredients_by_ndc as
				select a.value as ndc, b.value as drug_ingredient, c.str as ingredient_name
				from drug_cache a join drug_cache b on a.rxcui=b.rxcui
				join RXNCONSO c on c.rxcui=b.value
				where a.property='ndc' and b.property='ingredient'
				and c.sab='RXNORM' and c.tty='IN'
                ''')
Example #10
0
    def setup_tables(cls):
        """ Creates the SQLite tables we need, not the tables we deserve.
		"""
        if cls.sqlite_handle is None:
            cls.sqlite_handle = SQLite.get('databases/snomed.db')

        # descriptions
        cls.sqlite_handle.create(
            'descriptions', '''(
				concept_id INTEGER PRIMARY KEY,
				lang TEXT,
				term TEXT,
				isa VARCHAR,
				active INT
			)''')
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS isa_index ON descriptions (isa)")

        # relationships
        cls.sqlite_handle.create(
            'relationships', '''(
				relationship_id INTEGER PRIMARY KEY,
				source_id INT,
				destination_id INT,
				rel_type INT,
				rel_text VARCHAR,
				active INT
			)''')
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS source_index ON relationships (source_id)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS destination_index ON relationships (destination_id)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS rel_type_index ON relationships (rel_type)"
        )
        cls.sqlite_handle.execute(
            "CREATE INDEX IF NOT EXISTS rel_text_index ON relationships (rel_text)"
        )
Example #11
0
 def __init__(self):
     """Obtain the handle for the RxNorm database through ``SQLite.get``."""
     database_file = "databases/rxnorm.db"
     self.sqlite = SQLite.get(database_file)
Example #12
0
 def __init__(self):
     """Obtain the handle for the SNOMED database through ``SQLite.get``."""
     database_file = "databases/snomed.db"
     self.sqlite = SQLite.get(database_file)
Example #13
0
	def __init__(self):
		""" Obtains the handle for databases/umls.db, located relative to the
		directory containing this module rather than the working directory.
		"""
		module_dir = os.path.dirname(os.path.realpath(__file__))
		database_file = os.path.join(module_dir, 'databases/umls.db')
		self.sqlite = SQLite.get(database_file)
Example #14
0
 def __init__(self):
     """Obtain the handle for the UMLS database through ``SQLite.get``.

     The path is relative, so it is resolved against the current working
     directory, not against the location of this module.
     """
     self.sqlite = SQLite.get("../../databases/umls.db")
Example #15
0
 def __init__(self):
     """Obtain the handle for the UMLS database through ``SQLite.get``."""
     database_file = "databases/umls.db"
     self.sqlite = SQLite.get(database_file)
Example #16
0
 def __init__(self):
     """Obtain the handle for the SNOMED database through ``SQLite.get``."""
     database_file = 'databases/snomed.db'
     self.sqlite = SQLite.get(database_file)
class ToiScraper():
  """Scrapes the Times of India (TOI) daily archive pages into a SQLite table.

  One archive page is fetched per calendar day. Its article links are
  de-duplicated by title and stored as (ds, title, url) rows in the
  ``articles`` table. ``run`` walks forward one day at a time and sleeps once
  it has caught up with the current date in IST.
  """
  TABLE_NAME = 'articles'
  TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'), (u'url', u'text')]
  # Manually observed minimum date on TOI
  INIT_DATE = (2000, 1, 18)
  # A day only counts as fully scraped when it holds more than this many rows
  MIN_ENTRIES = 600
  # Upper bound, in seconds, for a single sleep while waiting for the next day
  MAX_SLEEP = 3600

  def __init__(self):
    """Open the database, ensure the articles table, and pick the start date.

    Raises:
      Exception: if the table already exists with a schema other than
        ``TABLE_SCHEMA``.
    """
    # ====  Required vars ===== #
    # NOTE(review): presumably read by a daemon runner - confirm
    self.stdin_path = '/dev/null'
    self.stdout_path = '/dev/null'
    self.stderr_path = '/dev/null'
    # self.pidfile_path =  '/var/run/toidaemon/toidaemon.pid'
    self.pidfile_path = PID_FILE_PATH
    self.pidfile_timeout = 5
    # ========================= #

    self.db_name = DB_PATH
    self.db = SQLite(self.db_name)
    self.table = self.db.get(ToiScraper.TABLE_NAME)
    logger.info("Initializing...")
    if not self.table:
      logger.info("No table found with name {0}. Creating it.".format(ToiScraper.TABLE_NAME))
      self.table = self.db.create(ToiScraper.TABLE_NAME, ToiScraper.TABLE_SCHEMA)
    else:
      if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
        error_str = "Table {0} exists but with incorrect schema".format(ToiScraper.TABLE_NAME)
        logger.error(error_str)
        raise Exception(error_str)
    self.iter_date = self._get_init_date_full()

  # Get the day after the last date in the database that has more than
  # MIN_ENTRIES rows (enough to tell that it's full)
  def _get_init_date_full(self):
    """Return the (year, month, day) tuple from which scraping should resume.

    This is the day after the most recent ``ds`` holding more than
    ``MIN_ENTRIES`` rows, or ``INIT_DATE`` when no such day exists.
    """
    logger.info("Retrieving last retrieved date from database with at least {0} in it".format(ToiScraper.MIN_ENTRIES))
    first_date = self.db.execute("""
        SELECT
          a.ds,
          a.count
        FROM (
          SELECT
            ds,
            count(1) AS count
          FROM {0}
          GROUP BY ds
          ORDER BY DATE(ds) DESC
        ) a
        WHERE a.count > {1}
        LIMIT 1;
      """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES),
      get=True
    )
    if len(first_date) == 0:
      logger.info("No last date with given minimum entries found in DB, starting from beginning.")
      return ToiScraper.INIT_DATE
    logger.info("Last date with entries {0} found. {1} entries total.".format(first_date[0][0], first_date[0][1]))
    return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))


  # Get the day after the last date in the database with entries in it
  def _get_init_date(self):
    """Return the day after the most recent ``ds`` in the table.

    Falls back to ``INIT_DATE`` when the table is empty.
    """
    logger.info("Retrieving last retrieved date from database")
    first_date = self.db.execute('SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(ToiScraper.TABLE_NAME), get=True)
    if len(first_date) == 0:
      logger.info("No last date found in DB, starting from beginning.")
      return ToiScraper.INIT_DATE
    logger.info("Last date {0} found.".format(first_date[0]['ds']))
    return self.get_next_day(*tuple(map(int, first_date[0]['ds'].split('-'))))

  def get_last_valid_date(self):
    """Return the current time in IST (UTC+5:30) as a naive datetime."""
    return datetime.utcnow() + timedelta(hours=5, minutes=30)

  # Check if the date is strictly before today in IST
  def is_valid_date(self, year, month, day):
    """Return True when the day has fully elapsed in IST and is not before
    ``INIT_DATE``; False otherwise, including for impossible calendar dates.
    """
    try:
      cur_time = datetime(year, month, day)
    except ValueError:
      return False
    india_time = self.get_last_valid_date()
    return cur_time + timedelta(days = 1) < india_time and cur_time >= datetime(*ToiScraper.INIT_DATE)

  def compute_url_for_day(self, year, month, day):
    """Return the TOI archive URL for the given day, or None if the day is
    not valid according to ``is_valid_date``.
    """
    if not self.is_valid_date(year, month, day):
      return None
    # Day count used in TOI URL (1st October, 2015 == 42278)
    day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
    return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
        year = year,
        month = month,
        day = day,
        daycount = day_count
      )

  def get_next_day(self, year, month, day):
    """Return the following calendar day as a (year, month, day) tuple."""
    next_day = datetime(year, month, day) + timedelta(days = 1)
    return (next_day.year, next_day.month, next_day.day)

  def _retrieve_url_contents(self, url, datetuple):
    """Fetch an archive page and return its articles.

    Returns a list of ``[ds, title, url]`` lists, where ``ds`` is the
    zero-padded ``YYYY-MM-DD`` form of ``datetuple``. Links with empty text,
    links rejected by ``validate_url`` and repeated titles are dropped.

    Raises:
      Exception: if the page does not contain exactly one div matching the
        expected style signature.
    """
    logger.debug("Request sent to url {0}".format(url))
    req = requests.get(url)
    logger.debug("Response retrieved, parsing")
    soup = BeautifulSoup(req.text, 'lxml')
    # Signature of the element we're interested in. We rely on the TOI webpage
    # not to change
    divs = soup.find_all('div', style='font-family:arial ;font-size:12;font-weight:bold; color: #006699')
    if len(divs) != 1:
      error_str = "Found {0} divs matching signature. Aborting.".format(len(divs))
      # the class has no `error` method; log through the module logger so the
      # intended Exception below is what callers actually see
      logger.error(error_str)
      raise Exception(error_str)
    articles = divs[0].find_all('a')
    logger.debug("Found {0} hyperlinks in the archive.".format(len(articles)))
    articles = [a for a in articles if len(a.text) > 0]
    res = []
    titles = set()
    for art in articles:
      corr_url = self.validate_url(art['href'])
      if corr_url:
        if art.text in titles:
          continue
        titles.add(art.text)
        res.append([
          datetime(*datetuple).strftime('%Y-%m-%d'),
          art.text,
          corr_url,
        ])
    logger.debug("Finished parsing, {0} rows remain".format(len(res)))
    return res

  # TOI specific article URL validation and correction
  def validate_url(self, url):
    """Return a usable article URL, or None if the link should be ignored.

    URLs starting with ``http://`` and containing ``.indiatimes.com/`` are
    returned unchanged. Otherwise a link ending in ``.cms`` that contains
    neither ``http`` nor a space is treated as site-relative and prefixed
    with the TOI host; everything else yields None.
    """
    URL_CORRECT = 'http://timesofindia.indiatimes.com/'
    URL_STANDARD = 'http://'
    URL_INSIDE = '.indiatimes.com/'
    if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
      if not url.endswith('.cms') or 'http' in url or ' ' in url:
        return None
      else:
        return URL_CORRECT + url
    return url

  def dedup_insert(self, data, ds):
    """Merge freshly scraped rows with the rows already stored for a day.

    Args:
      data: rows shaped like the output of ``_retrieve_url_contents``.
      ds: (year, month, day) tuple of the day being stored.

    Existing rows win on title clashes. When rows already exist for the day
    they are deleted and the merged set is inserted in their place.
    """
    # rows are stored with a zero-padded ds (see _retrieve_url_contents), so
    # the lookup key has to be padded the same way to find them
    date_str = datetime(*ds).strftime('%Y-%m-%d')
    logger.debug("Asking to insert {0} articles in {1}".format(len(data), date_str))
    rows = self.table.where({'ds': date_str})
    logger.debug("Already {0} rows exist in {1}".format(len(rows), date_str))
    titles = set()
    res = []
    for a in rows:
      if a['title'] not in titles:
        titles.add(a['title'])
        res.append((a['ds'], a['title'], a['url']))
    for r in data:
      if r[1] not in titles:
        titles.add(r[1])
        res.append(r)
    logger.debug("{0} rows left after deduplicating".format(len(res)))
    if len(rows) > 0:
      logger.info("Deleting {0} rows from {1}".format(len(rows), date_str))
      self.table.del_where({'ds': date_str})
    if len(res) > 0:
      logger.info("Inserting {0} rows from {1}".format(len(res), date_str))
      self.table.insert(res)

  def get_articles_for_day(self, year, month, day):
    """Scrape and store one day; return the number of rows scraped.

    Returns 0 without any request when the day is not valid.
    """
    logger.debug("Getting articles for the day")
    url = self.compute_url_for_day(year, month, day)
    if not url:
      return 0
    data = self._retrieve_url_contents(url, (year, month, day))
    self.dedup_insert(data, (year, month, day))
    return len(data)

  def run(self):
    """Scrape day after day forever, sleeping while there is nothing new."""
    while True:
      while not self.is_valid_date(*self.iter_date):
        next_date = datetime(*self.iter_date) + timedelta(days=1)
        # NOTE: `.seconds` is only the seconds component of the timedelta;
        # whole days are not included
        sec_to_next_date = (next_date - self.get_last_valid_date()).seconds
        logger.info("Reached the end, {0} seconds until {1}".format(sec_to_next_date, datetime(*self.iter_date).strftime('%Y-%m-%d')))
        if sec_to_next_date <= ToiScraper.MAX_SLEEP:
          time.sleep(sec_to_next_date)
        else:
          logger.info('Seconds till next day {0} greater than {1}, so only sleeping for {1}'.format(sec_to_next_date, ToiScraper.MAX_SLEEP))
          time.sleep(ToiScraper.MAX_SLEEP)
        logger.info('Woken up, getting init date again')
        # re-read from the database, so a day that did not reach MIN_ENTRIES
        # gets scraped again
        self.iter_date = self._get_init_date_full()
        logger.info('New date set to {0}'.format(self.iter_date))
      logger.info("Retrieving articles for date {0}".format(self.iter_date))
      num_rows = self.get_articles_for_day(*self.iter_date)
      logger.info("Retrieved {0} rows from TOI".format(num_rows))
      if num_rows == 0:
        # stay on the same day and retry shortly
        logger.debug("Sleeping for 10 seconds, no rows retrieved")
        time.sleep(10)
      else:
        self.iter_date = self.get_next_day(*self.iter_date)
        logger.debug("Iterated to next day - {0}".format(datetime(*self.iter_date)))
Example #18
0
 def __init__(self):
     """Obtain the handle for the RxNorm database through ``SQLite.get``."""
     database_file = 'databases/rxnorm.db'
     self.sqlite = SQLite.get(database_file)
class ToiScraper():
    """Scrapes the Times of India (TOI) daily archive pages into SQLite.

    One archive page is fetched per calendar day. Its article links are
    de-duplicated by title and stored as (ds, title, url) rows in the
    ``articles`` table. ``run`` walks forward one day at a time and sleeps
    once it has caught up with the current date in IST. Progress is reported
    with ``print``.
    """
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'),
                    (u'url', u'text')]
    # First date that is scraped; earlier dates are rejected by is_valid_date
    INIT_DATE = (2020, 1, 1)
    # A day only counts as fully scraped when it holds more than this many rows
    MIN_ENTRIES = 600
    # Upper bound, in seconds, for a single sleep while waiting for the next day
    MAX_SLEEP = 3600

    def __init__(self):
        """Open the database, ensure the articles table, pick the start date.

        Raises:
            Exception: if the table already exists with a schema other than
                ``TABLE_SCHEMA``.
        """
        # ====  Required vars ===== #
        # NOTE(review): presumably read by a daemon runner - confirm
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        # self.pidfile_path =  '/var/run/toidaemon/toidaemon.pid'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #

        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(
                ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME,
                                        ToiScraper.TABLE_SCHEMA)
        else:
            if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
                error_str = "Table {0} exists but with incorrect schema".format(
                    ToiScraper.TABLE_NAME)
                print(error_str)
                raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    # Get the day after the last date in the database that has more than
    # MIN_ENTRIES rows (enough to tell that it's full)
    def _get_init_date_full(self):
        """Return the (year, month, day) tuple from which scraping resumes.

        This is the day after the most recent ``ds`` holding more than
        ``MIN_ENTRIES`` rows, or ``INIT_DATE`` when no such day exists.
        """
        print(
            "Retrieving last retrieved date from database with at least {0} in it"
            .format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
        SELECT
          a.ds,
          a.count
        FROM (
          SELECT
            ds,
            count(1) AS count
          FROM {0}
          GROUP BY ds
          ORDER BY DATE(ds) DESC
        ) a
        WHERE a.count > {1}
        LIMIT 1;
      """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES),
                                     get=True)
        if len(first_date) == 0:
            print(
                "No last date with given minimum entries found in DB, starting from beginning."
            )
            return ToiScraper.INIT_DATE
        print("Last date with entries {0} found. {1} entries total.".format(
            first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    # Get the day after the last date in the database with entries in it
    def _get_init_date(self):
        """Return the day after the most recent ``ds`` in the table.

        Falls back to ``INIT_DATE`` when the table is empty.
        """
        print("Retrieving last retrieved date from database")
        first_date = self.db.execute(
            'SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(
                ToiScraper.TABLE_NAME),
            get=True)
        if len(first_date) == 0:
            print("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(
            *tuple(map(int, first_date[0]['ds'].split('-'))))

    def get_last_valid_date(self):
        """Return the current time in IST (UTC+5:30) as a naive datetime."""
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    # Check if the date is strictly before today in IST
    def is_valid_date(self, year, month, day):
        """Return True when the day has fully elapsed in IST and is not
        before ``INIT_DATE``; False otherwise, including for impossible
        calendar dates.
        """
        try:
            cur_time = datetime(year, month, day)
        except ValueError:
            return False
        india_time = self.get_last_valid_date()
        return (cur_time + timedelta(days=1) < india_time
                and cur_time >= datetime(*ToiScraper.INIT_DATE))

    def compute_url_for_day(self, year, month, day):
        """Return the TOI archive URL for the given day, or None if the day
        is not valid according to ``is_valid_date``.
        """
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        """Return the following calendar day as a (year, month, day) tuple."""
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

    def _retrieve_url_contents(self, url, datetuple):
        """Fetch an archive page and return its articles.

        Returns a list of ``[ds, title, url]`` lists, where ``ds`` is the
        zero-padded ``YYYY-MM-DD`` form of ``datetuple``. Links with empty
        text, links rejected by ``validate_url`` and repeated titles are
        dropped.

        Raises:
            Exception: if the page does not contain exactly one div matching
                the expected style signature.
        """
        print("Request sent to url {0}".format(url))
        req = requests.get(url)
        print("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in. We rely on the TOI webpage
        # not to change
        divs = soup.find_all(
            'div',
            style=
            'font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if len(divs) != 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(
                len(divs))
            # the class has no `error` method; report like the rest of the
            # class does so the intended Exception below reaches the caller
            print(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        print("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        res = []
        titles = set()
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if corr_url:
                if art.text in titles:
                    continue
                titles.add(art.text)
                res.append([
                    datetime(*datetuple).strftime('%Y-%m-%d'),
                    art.text,
                    corr_url,
                ])
        print("Finished parsing, {0} rows remain".format(len(res)))
        return res

    # TOI specific article URL validation and correction
    def validate_url(self, url):
        """Return a usable article URL, or None if the link should be ignored.

        URLs starting with ``http://`` and containing ``.indiatimes.com/``
        are returned unchanged. Otherwise a link ending in ``.cms`` that
        contains neither ``http`` nor a space is treated as site-relative and
        prefixed with the TOI host; everything else yields None.
        """
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
            if not url.endswith('.cms') or 'http' in url or ' ' in url:
                return None
            else:
                return URL_CORRECT + url
        return url

    def dedup_insert(self, data, ds):
        """Merge freshly scraped rows with the rows already stored for a day.

        Args:
            data: rows shaped like the output of ``_retrieve_url_contents``.
            ds: (year, month, day) tuple of the day being stored.

        Existing rows win on title clashes. When rows already exist for the
        day they are deleted and the merged set is inserted in their place.
        """
        # rows are stored with a zero-padded ds (see _retrieve_url_contents),
        # so the lookup key has to be padded the same way to find them
        date_str = datetime(*ds).strftime('%Y-%m-%d')
        print("Asking to insert {0} articles in {1}".format(
            len(data), date_str))
        rows = self.table.where({'ds': date_str})
        print("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set()
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        print("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            print("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            print("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        """Scrape and store one day; return the number of rows scraped.

        Returns 0 without any request when the day is not valid.
        """
        print("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

    def run(self):
        """Scrape day after day forever, sleeping while nothing is new."""
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                # NOTE: `.seconds` is only the seconds component of the
                # timedelta; whole days are not included
                sec_to_next_date = (next_date -
                                    self.get_last_valid_date()).seconds
                print("Reached the end, {0} seconds until {1}".format(
                    sec_to_next_date,
                    datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    print(
                        'Seconds till next day {0} greater than {1}, so only sleeping for {1}'
                        .format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                print('Woken up, getting init date again')
                # re-read from the database, so a day that did not reach
                # MIN_ENTRIES gets scraped again
                self.iter_date = self._get_init_date_full()
                print('New date set to {0}'.format(self.iter_date))
            print("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            print("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                # stay on the same day and retry shortly
                print("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                print("Iterated to next day - {0}".format(
                    datetime(*self.iter_date)))
Example #20
0
 def __init__(self):
     """Obtain the handle for the UMLS database through ``SQLite.get``."""
     database_file = 'databases/umls.db'
     self.sqlite = SQLite.get(database_file)
Example #21
0
 def __init__(self):
     """Obtain the handle for the database at ``SNOMED.database_path()``."""
     snomed_db = SNOMED.database_path()
     self.sqlite = SQLite.get(snomed_db)
Example #22
0
	def sqlite_assure_handle(cls):
		""" Makes sure `cls.sqlite_handle` is set, obtaining a handle for
		`cls.sqlite_default_db` through `SQLite.get` when it is still None.
		"""
		if cls.sqlite_handle is not None:
			return
		cls.sqlite_handle = SQLite.get(cls.sqlite_default_db)