if __name__ == '__main__':
    data = {}

    print "DATA FETCHING"
    for name, v in config.iteritems():
        url = v[0]
        data[name] = load(url)
        print "\t%s : %d" % (name, len(data[name]))
    print

    print "DB INTERACT"
    conn = get_connection()

    logfile = open(LOG_FILE, 'a')
    logfile.write("=" * 80)
    logfile.write("\nSTART (" + str(timer.timestamp()) + ')\n')

    outfile = open(OUT_FOLDER + str(timer.timestamp()) + '.csv', 'wb')
    csv_writer = UnicodeWriter(outfile)

    results = {}
    for name, values in data.iteritems():
        cols = config[name][1]

        cur = conn.cursor()
        cur.execute('select %s from %s' % (', '.join(cols), name))
        in_db = cur.fetchall()
        cur_insert = conn.cursor()

        print "\ttable : %s" % name
        print "\t\titems in db : %d" % len(in_db)

        results[name] = 0
        for value in values:
            if tuple(value) not in in_db:
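# UnicodeWriter is not defined in this excerpt. A minimal sketch is given
# below, assuming it follows the well-known recipe from the Python 2 csv
# module documentation; the project's actual class may differ.
import codecs
import csv
import cStringIO


class UnicodeWriter:
    """Write rows of unicode strings to a file 'f' as UTF-8 encoded CSV."""

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # buffer rows in memory, then re-encode them onto the real stream
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        # Python 2's csv module cannot handle unicode, so encode first ...
        self.writer.writerow([s.encode("utf-8") for s in row])
        data = self.queue.getvalue().decode("utf-8")
        # ... then re-encode the whole row into the target encoding
        self.stream.write(self.encoder.encode(data))
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)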
def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    logger.setLevel(logging.WARNING)

    # start info
    logger.info("START")

    # classifier
    tcl = TwitterClassifier()

    # get twitter ids - only twitter documents should be classified
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("select id from sources_twitter")
    twitter_ids = [row[0] for row in cursor]

    while True:
        # feeds init
        # XXX - performance problems - sources should be loaded before the
        # while loop...
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds

        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s"
                        % (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), source.get_etag(),
                                       modified)

            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irrelevant = 0

            # work with items
            for item in data['items']:
                items_count += 1

                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())
                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))
                try:
                    Item.set_termvector(get_termvector(
                        Item.get_text(), Item.get_language(), conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    continue

                Item.set__relevance(None)
                # we classify only twitter documents
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # skip documents classified below the threshold
                        classified_as_irrelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))

                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section() + ":"
                              + control_chars.remove(item['guid']))

                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate:
                        Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime:
                        Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # don't want items without a pubdate
                    continue

                # following links
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(
                        control_chars.remove(page.get('description', '')))
                    Item.set_html_keywords(
                        control_chars.remove(page.get('keywords', '')))

                # insert it
                if Item.get_text():
                    inserted, id = Item.insert()
                    if inserted:
                        logger.debug("Document successfully inserted into db"
                                     " with id=%s" % Item.get_id())
                        yield str(id)  # output
                    else:
                        logger.debug("Document already in db with id=%s" % id)
                else:
                    logger.info("Item has no text!")

            # outputting
            logger.info("Created OUTPUT\tITEMS:%d\tIRRELEVANT:%d",
                        data['items_count'], classified_as_irrelevant)

        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
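# str2tuple and tuple2str are not shown in this excerpt. Assuming 'modified'
# is the time 9-tuple that feedparser-style conditional GET expects, the two
# helpers could look like the sketch below; the storage format and the
# helper bodies are assumptions, not the project's actual code.
import time

MODIFIED_FORMAT = "%Y-%m-%d %H:%M:%S"  # hypothetical storage format


def str2tuple(s):
    # parse the stored string back into a time 9-tuple; empty maps to None
    if not s:
        return None
    return time.strptime(s, MODIFIED_FORMAT)


def tuple2str(t):
    # serialize the 9-tuple for storage in the sources table
    if not t:
        return None
    return time.strftime(MODIFIED_FORMAT, t)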
def insert_document(url, pubdate, pubtime):
    # NOTE: the excerpt begins mid-function; this name and signature are
    # inferred from the names used in the body below.
    try:
        data = db_downloader.download_url(url)
    except Exception, e:
        print e
        return None

    if not data.get('text'):
        print 'db_url-stahovak: no text'
        return None

    dbdoc = Documents()
    dbdoc.set_pubDate(pubdate)
    dbdoc.set_pubTime(pubtime)
    dbdoc.set_text(control_chars.remove(data['text']))
    dbdoc.set_title(control_chars.remove(data.get('title', '')))
    dbdoc.set_source_id(SOURCE_ID)
    dbdoc.set_language(u'en')
    dbdoc.set_timestamp(timer.timestamp())
    dbdoc.set_link(url)
    dbdoc.set_html_description(
        control_chars.remove(data.get('description', '')))
    dbdoc.set_html_keywords(control_chars.remove(data.get('keywords', '')))
    # sha224 of the url keeps the guid stable across repeated downloads
    dbdoc.set_guid(GUID_PREFIX + ":" + hashlib.sha224(url).hexdigest())
    ok, id = dbdoc.insert()
    return id


if __name__ == '__main__':
    # Download a document for each stdin line in the form
    # 'url\tpubdate\tpubtime'.
    for line in sys.stdin:
        line = line.strip()
        if len(line) == 0:
            continue
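        # The rest of the loop is truncated in this excerpt. Given the
        # documented 'url\tpubdate\tpubtime' input format, the remainder
        # presumably splits the line and calls the insert helper; a sketch
        # (an assumption, using the inferred function name from above):
        url, pubdate, pubtime = line.split('\t')
        print insert_document(url, pubdate, pubtime)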