def test():
    """Continuously hand documents of one language to ``affect_docs``.

    The language code is taken from ``sys.argv[1]``.  Each pass asks
    ``MDocuments.get_multi`` for up to ``LIMIT`` documents whose ids are
    listed in ``documents_to_affected`` for that language, wraps every
    returned row in ``Documents`` and passes the result to ``affect_docs``
    in chunks of ``CHUNK_SIZE``.  When nothing is returned the function
    calls ``timer.sleep_minute(120)`` and polls again.  The loop has no
    exit condition.

    Raises:
        IndexError: if no language argument was given on the command line.
    """
    language = sys.argv[1]
    CHUNK_SIZE = 10
    LIMIT = CHUNK_SIZE * 5
    cursor = Documents.conn.cursor()

    # The filter is spliced into the SQL text, so double any single quote
    # to keep a stray quote from terminating the literal early.
    # NOTE(review): prefer bound parameters if get_multi() supports them.
    quoted_language = language.replace("'", "''")
    where_clause = (
        """id IN (SELECT id FROM documents_to_affected WHERE language='%s' LIMIT %d)"""
        % (quoted_language, LIMIT)
    )

    while True:
        batch = MDocuments()
        batch.get_multi(limit=LIMIT, where=where_clause)
        docs = [Documents(**row) for row in batch.value()]

        if not docs:
            # Single-argument print() behaves the same as the Python 2
            # print statement the file otherwise uses.
            print("NO DOCUMENTS!\nSLEEPING.")
            timer.sleep_minute(120)
            continue

        for chunk in chunks(docs, CHUNK_SIZE):
            # affect_docs receives the raw (unescaped) language value.
            affect_docs(chunk, language, cursor)
def test():
    """Continuously hand documents of one language to ``affect_docs``.

    The language code is taken from ``sys.argv[1]``.  Each pass asks
    ``MDocuments.get_multi`` for up to ``LIMIT`` documents whose ids are
    listed in ``documents_to_affected`` for that language, wraps every
    returned row in ``Documents`` and passes the result to ``affect_docs``
    in chunks of ``CHUNK_SIZE``.  When nothing is returned the function
    calls ``timer.sleep_minute(120)`` and polls again.  The loop has no
    exit condition.

    Raises:
        IndexError: if no language argument was given on the command line.
    """
    language = sys.argv[1]
    CHUNK_SIZE = 10
    LIMIT = CHUNK_SIZE * 5
    cursor = Documents.conn.cursor()

    # The filter is spliced into the SQL text, so double any single quote
    # to keep a stray quote from terminating the literal early.
    # NOTE(review): prefer bound parameters if get_multi() supports them.
    quoted_language = language.replace("'", "''")
    where_clause = (
        """id IN (SELECT id FROM documents_to_affected WHERE language='%s' LIMIT %d)"""
        % (quoted_language, LIMIT)
    )

    while True:
        batch = MDocuments()
        batch.get_multi(limit=LIMIT, where=where_clause)
        docs = [Documents(**row) for row in batch.value()]

        if not docs:
            # Single-argument print() behaves the same as the Python 2
            # print statement the file otherwise uses.
            print("NO DOCUMENTS!\nSLEEPING.")
            timer.sleep_minute(120)
            continue

        for chunk in chunks(docs, CHUNK_SIZE):
            # affect_docs receives the raw (unescaped) language value.
            affect_docs(chunk, language, cursor)
# Script body: compute and store a term vector for every 'en'/'de' document
# in the selected pubdate window that does not have one yet, LIMIT rows at a
# time, polling forever.
# NOTE(review): LIMIT is not defined in this fragment; it is expected to be
# bound elsewhere in the module before this code runs.
conn = get_connection(UNICODE=True)
# NOTE(review): autocommit was left disabled here and no commit() is issued
# below - confirm that get_connection()/get_termvector() take care of it.
print("Analyzing whole database")
sys.stdout.flush()

while True:
    curr = conn.cursor()
    curr.execute(
        """SELECT id, language, text FROM documents WHERE termvector is null and language in ('en', 'de') and pubdate>='01-05-2011' and pubdate<'01-07-2011' LIMIT %s""",
        (LIMIT,),
    )

    count = 0
    for doc_id, lang, text in curr:
        # Single-argument print() matches the Python 2 print statement.
        print("id=%s" % doc_id)
        termvector = get_termvector(text, lang, conn)
        # A separate cursor keeps the UPDATE from disturbing the result set
        # that is still being iterated.
        insertcurr = conn.cursor()
        insertcurr.execute(
            """UPDATE documents SET termvector=%s WHERE id=%s""",
            (termvector, doc_id),
        )
        count += 1
    sys.stdout.flush()

    if not count:
        # Nothing left to analyse right now; wait before polling again.
        print("SLEEPING")
        timer.sleep_minute(60)
def _assign_token_positions(text, instances, sentence_tokenizer, word_tokenizer):
    """Fill in sentence id, begin token and end token of each instance row.

    Each row is a 9-item list laid out as
    ``[id, entity_id, item_id, exact, offset, length, sid, begin, end]``
    and is updated in place.  ``offset``/``length`` are compared against
    token spans expressed relative to the whole ``text``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source - confirm against the original layout.
    """
    for sid, (sent_start, sent_end) in enumerate(
            sentence_tokenizer.span_tokenize(text)):
        sentence = text[sent_start:sent_end]
        for pos, (rel_start, rel_end) in enumerate(
                word_tokenizer.span_tokenize(sentence)):
            # Token spans come back relative to the sentence, while the
            # instance offsets count from the start of the text.
            tok_start = sent_start + rel_start
            tok_end = sent_start + rel_end

            for row in instances:
                # Full unpack keeps the "exactly nine columns" requirement.
                (_inst_id, _entity_id, _item_id, _exact,
                 offset, length, sid_, begin, end) = row

                # First token whose span contains the instance offset
                # fixes both the sentence id and the begin token.
                if (sid_ is None and begin is None
                        and tok_start <= offset <= tok_end):
                    row[-2] = begin = pos
                    row[-3] = sid_ = sid

                # The end token is only searched inside the sentence in
                # which the instance begins.
                if sid_ == sid and end is None and begin is not None:
                    off = offset + length
                    if tok_start <= off <= tok_end:
                        # An instance ending exactly where this token
                        # starts belongs to the previous token.
                        row[-1] = pos - 1 if off == tok_start else pos


def main():
    """Back-fill ``sid``/``begintoken``/``endtoken`` of ``instances`` rows.

    Forever: pick one document that still has an instance with a NULL
    ``begintoken``, load those instances, align them with the sentence and
    whitespace tokens of the document text and write the positions back.
    When no such document exists, call ``timer.sleep_minute(30)`` and poll
    again.

    NOTE(review): no commit() is issued here - confirm the connection
    returned by get_connection() persists the updates.
    NOTE(review): an instance whose begin token is never found is written
    back with a NULL begintoken, so its document matches the SELECT again
    on the next pass - confirm that is intended.
    """
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    # The tokenizers are reused for every document instead of being rebuilt
    # inside the loops.
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()

    while True:
        # The former "--guid=..." debugging filter was dropped: a SQL line
        # comment sharing a line with the EXISTS clause comments it out.
        curr.execute(
            """SELECT id, text, language FROM documents
               WHERE EXISTS (
                   SELECT 1 FROM instances
                   WHERE item_id=documents.id AND begintoken IS NULL)
               LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            # Single-argument print() matches the Python 2 print statement.
            print("sleep")
            timer.sleep_minute(30)
            continue

        doc_id, text, _language = data
        print("id %s" % (doc_id,))

        curr.execute(
            """SELECT * FROM instances WHERE item_id = %s AND begintoken IS NULL""",
            (doc_id,))
        instances = []
        for record in curr:
            # Throw away the trailing `confidence` column ...
            row = list(record)[:-1]
            # ... and clear sid / begin / end so they are recomputed.
            row[-1] = None
            row[-2] = None
            row[-3] = None
            instances.append(row)
        if not instances:
            continue

        _assign_token_positions(text, instances,
                                sentence_tokenizer, word_tokenizer)

        for row in instances:
            print(row)
            (inst_id, _entity_id, _item_id, exact,
             _offset, _length, sid, begin, end) = row
            if end is None:
                # No end token found: a single-word instance ends where it
                # begins, anything containing a space is marked with -1.
                end = begin if " " not in exact else -1
            curr.execute(
                """UPDATE instances SET sid=%s, begintoken=%s, endtoken=%s WHERE id=%s""",
                (sid, begin, end, inst_id))
def _assign_token_positions(text, instances, sentence_tokenizer, word_tokenizer):
    """Fill in sentence id, begin token and end token of each instance row.

    Each row is a 9-item list laid out as
    ``[id, entity_id, item_id, exact, offset, length, sid, begin, end]``
    and is updated in place.  ``offset``/``length`` are compared against
    token spans expressed relative to the whole ``text``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed source - confirm against the original layout.
    """
    for sid, (sent_start, sent_end) in enumerate(
            sentence_tokenizer.span_tokenize(text)):
        sentence = text[sent_start:sent_end]
        for pos, (rel_start, rel_end) in enumerate(
                word_tokenizer.span_tokenize(sentence)):
            # Token spans come back relative to the sentence, while the
            # instance offsets count from the start of the text.
            tok_start = sent_start + rel_start
            tok_end = sent_start + rel_end

            for row in instances:
                # Full unpack keeps the "exactly nine columns" requirement.
                (_inst_id, _entity_id, _item_id, _exact,
                 offset, length, sid_, begin, end) = row

                # First token whose span contains the instance offset
                # fixes both the sentence id and the begin token.
                if (sid_ is None and begin is None
                        and tok_start <= offset <= tok_end):
                    row[-2] = begin = pos
                    row[-3] = sid_ = sid

                # The end token is only searched inside the sentence in
                # which the instance begins.
                if sid_ == sid and end is None and begin is not None:
                    off = offset + length
                    if tok_start <= off <= tok_end:
                        # An instance ending exactly where this token
                        # starts belongs to the previous token.
                        row[-1] = pos - 1 if off == tok_start else pos


def main():
    """Back-fill ``sid``/``begintoken``/``endtoken`` of ``instances`` rows.

    Forever: pick one document that still has an instance with a NULL
    ``begintoken``, load those instances, align them with the sentence and
    whitespace tokens of the document text and write the positions back.
    When no such document exists, call ``timer.sleep_minute(30)`` and poll
    again.

    NOTE(review): no commit() is issued here - confirm the connection
    returned by get_connection() persists the updates.
    NOTE(review): an instance whose begin token is never found is written
    back with a NULL begintoken, so its document matches the SELECT again
    on the next pass - confirm that is intended.
    """
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    # The tokenizers are reused for every document instead of being rebuilt
    # inside the loops.
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()

    while True:
        # The former "--guid=..." debugging filter was dropped: a SQL line
        # comment sharing a line with the EXISTS clause comments it out.
        curr.execute(
            """SELECT id, text, language FROM documents
               WHERE EXISTS (
                   SELECT 1 FROM instances
                   WHERE item_id=documents.id AND begintoken IS NULL)
               LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            # Single-argument print() matches the Python 2 print statement.
            print("sleep")
            timer.sleep_minute(30)
            continue

        doc_id, text, _language = data
        print("id %s" % (doc_id,))

        curr.execute(
            """SELECT * FROM instances WHERE item_id = %s AND begintoken IS NULL""",
            (doc_id,))
        instances = []
        for record in curr:
            # Throw away the trailing `confidence` column ...
            row = list(record)[:-1]
            # ... and clear sid / begin / end so they are recomputed.
            row[-1] = None
            row[-2] = None
            row[-3] = None
            instances.append(row)
        if not instances:
            continue

        _assign_token_positions(text, instances,
                                sentence_tokenizer, word_tokenizer)

        for row in instances:
            print(row)
            (inst_id, _entity_id, _item_id, exact,
             _offset, _length, sid, begin, end) = row
            if end is None:
                # No end token found: a single-word instance ends where it
                # begins, anything containing a space is marked with -1.
                end = begin if " " not in exact else -1
            curr.execute(
                """UPDATE instances SET sid=%s, begintoken=%s, endtoken=%s WHERE id=%s""",
                (sid, begin, end, inst_id))
if __name__ == "__main__":
    # Script entry point: compute and store a term vector for every
    # 'en'/'de' document in the selected pubdate window that does not have
    # one yet, LIMIT rows per pass, polling forever.
    LIMIT = 20
    conn = get_connection(UNICODE=True)
    # NOTE(review): autocommit was left disabled here and no commit() is
    # issued below - confirm that get_connection()/get_termvector() take
    # care of it.
    print("Analyzing whole database")
    sys.stdout.flush()

    while True:
        curr = conn.cursor()
        curr.execute(
            """SELECT id, language, text FROM documents WHERE termvector is null and language in ('en', 'de') and pubdate>='01-05-2011' and pubdate<'01-07-2011' LIMIT %s""",
            (LIMIT,))

        count = 0
        for doc_id, lang, text in curr:
            # Single-argument print() matches the Python 2 print statement.
            print("id=%s" % doc_id)
            termvector = get_termvector(text, lang, conn)
            # A separate cursor keeps the UPDATE from disturbing the result
            # set that is still being iterated.
            insertcurr = conn.cursor()
            insertcurr.execute(
                """UPDATE documents SET termvector=%s WHERE id=%s""",
                (termvector, doc_id))
            count += 1
        sys.stdout.flush()

        if not count:
            # Nothing left to analyse right now; wait before polling again.
            print("SLEEPING")
            timer.sleep_minute(60)