# Build the scoring context for one HIT's control sentence and write the HIT.
#
# Reads names defined earlier in the script: ids, sents, candidates, dfs,
# cur2, outfile, conn.  Assigns: ids (re-sorted), augment_ids, docsents, b.
#
# The sentences handed to controls.best_control are looked up by id over a
# window that extends past both ends of `ids`, so rows adjacent to the HIT's
# own sentences are included.  (A disabled earlier variant selected every
# sentence of each entry in `docs` with "where doc=%s" instead.)
ids = sorted(ids)

# NOTE(review): range() excludes its stop value, so the window reaches three
# ids below the smallest id but only two above the largest -- confirm whether
# that asymmetry is intended.  An empty `ids` raises IndexError here.
augment_ids = list(range(ids[0] - 3, ids[-1] + 3))

docsents = []
idsql = 'SELECT sentence from esl_sentences where id=%s;'
for i in augment_ids:
    cur2.execute(idsql, (i,))
    # Ids with no matching row contribute nothing (fetchall() is then empty).
    docsents.extend(row[0] for row in cur2.fetchall())

# Single-argument print(...) produces the same output as the Python 2 print
# statement and also parses under Python 3.
print(len(docsents))
b = controls.best_control(docsents, candidates, dfs)
print(b)

# One HIT sentence per line, then the chosen control, then a blank separator.
for s in sents:
    outfile.write(s + '\n')
# NOTE(review): concatenation assumes best_control returns a string in this
# version of the pipeline -- TODO confirm against controls.best_control.
outfile.write("Control: " + b + '\n')
outfile.write('\n')

conn.close()
logging.info("esl hit creation pipeline - FINISH")
# Collect one batch of sentences, log it to sentfile, and append a control
# sentence chosen for the batch.
#
# Reads names defined earlier in the script: batchiter, sents, sentnums,
# sentfile, dfs, conn, args, outfile.  Rows are indexed positionally: s[0] is
# written to the header line and s[1] is the text passed to `controls`; the
# meaning of the remaining columns is not visible in this fragment.
for item in batchiter:
    sents.append(item)
    doc = item[6]
    sentnums.append(item[4])

#Insert 1 control sentence
# Header line: the first column of every row, space separated.
for s in sents:
    sentfile.write(str(s[0]) + ' ')
sentfile.write('\n')

justsents = [s[1] for s in sents]
topwords = controls.get_topn(justsents, dfs, 5)
querywords = [w for w, freq in topwords]

# str.replace() matches literal text, not a regex, so the former '\s' pattern
# never matched and the timestamp kept its space; replace the space itself.
# cachefile is only consumed by the disabled
# controls.pull_all_candidates_goog_cacheresults(querywords, cachefile) call.
cachefile = ('caches/controls.cache.'
             + str(datetime.datetime.now()).replace(' ', '-'))
candidates = controls.pull_all_candidates_from_cache(querywords)

if len(candidates) > 0:
    b = controls.best_control(justsents, candidates, dfs)
    # Single-argument print(...) matches the Python 2 print statement's output
    # and also parses under Python 3.
    for j in justsents:
        print(j)
        sentfile.write(j + '\n')
    # presumably b[0] is the top-ranked candidate and b[0][0] its text --
    # verify against controls.best_control.
    sentfile.write(str(b[0][0]) + '\n\n')
    print(b[0])

conn.commit()
conn.close()
if args.reload:
    outfile.close()
logging.info("esl hit creation pipeline - FINISH")
# NOTE(review): whitespace-collapsed fragment of the HIT-building code.  The
# statements below have lost their line breaks and indentation, and both the
# enclosing block and the body of the final `for` lie outside this view, so
# the nesting is not recoverable from here.
# In statement order: record hit_id in sentcounts and log the batch counter
# `check`; for each item in batchiter take item[4] as doc_id, pull control
# candidates via controls.pull_candidates() using the part of doc_id before
# the first '_', and fetch the sentence stored for that doc_id in
# esl_sentences; when candidates is non-empty, request nbest=5 results from
# controls.best_control(), pass each result's bb[0] through
# generrors.randerr() and controls.insert_into_db(), and, when the returned id
# is not -1, write controls.touni(bb[0]) plus a tab to outfile and insert a
# row into esl_hits_data; then commit.
# The trailing `else:` resets check to 0 and opens 'controls.log.bk' as UTF-8
# to iterate over its lines.
# NOTE(review): `i` is set to 0 and no increment is visible in this fragment,
# so every esl_hits_data row written here gets sentence_num 0 -- confirm.
# NOTE(review): no close() for cachedsents is visible here -- confirm it is
# closed further down.
sentcounts.append(hit_id) logging.info("Batch "+str(check)+" added") check+=1 sents = [] sentids = [] for item in batchiter: doc_id = item[4] candidates = controls.pull_candidates(doc_id.split('_')[0]) idsql = 'SELECT sentence from esl_sentences where doc_id=%s;' cur2.execute(idsql, (doc_id,)) sents.append(cur2.fetchone()[0]) sentids.append(doc_id) if(len(candidates) > 0): i = 0 b = controls.best_control(sents, candidates, dfs, nbest=5) for bb in b: bbuni = controls.touni(bb[0]) newb = generrors.randerr(bb[0]) cid = controls.insert_into_db(hit_id, newb, cur2) if(not(cid == -1)): outfile.write(bbuni+'\t') sql="INSERT INTO esl_hits_data(hit_id,esl_sentence_id,language_id,sentence_num)VALUES(%s,%s,%s,%s);" cur2.execute(sql,(hit_id, cid, lang_id, i)) outfile.write('\n') conn.commit() else: check = 0 cachedsents = codecs.open('controls.log.bk', encoding='utf-8', mode='r') for hit in cachedsents.readlines():