def buildGraph(dbName, host='10.1.1.9', port=50000):
    """Build an undirected adjacency map from the ``postlinks`` collection.

    Every link document contributes the edge ``PostId <-> RelatedPostId`` in
    both directions. ``LinkTypeId`` 3 is stored as weight 0 and ``LinkTypeId``
    1 as weight 1. When the same pair of posts is linked more than once, the
    link read last decides the weight.

    The cursor batch size is read from the module-level ``args.batch_size``.

    Args:
        dbName: database name handed to ``MongoStackExchange.useDB``.
        host: MongoDB host; defaults to the previously hard-coded server.
        port: MongoDB port; defaults to the previously hard-coded port.

    Returns:
        dict mapping each post id to ``{neighbour id: weight}``.

    Raises:
        ValueError: a link has a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host=host, port=port)
    try:
        db.useDB(dbName)
        links = db.stackdb.get_collection("postlinks")
        # Materialised up front so tqdm knows the total and the count can be
        # logged; it also means the connection is no longer needed afterwards.
        allLinks = list(links.find().batch_size(args.batch_size))
    finally:
        # The original never released the connection.
        db.close()

    myG = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        # Store the edge in both directions so lookups work from either end.
        myG.setdefault(id_a, {})[id_b] = w
        myG.setdefault(id_b, {})[id_a] = w

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    return myG
def _countCodeHeavy(docs, getText, processor, threshold, verbose):
    """Count documents whose code snippets exceed *threshold* of their text length.

    Args:
        docs: iterable of documents (here: a MongoDB cursor).
        getText: callable returning the text to analyse for one document.
        processor: object providing ``getCodeSnippets(text)``.
        threshold: minimum ``len(code) / len(text)`` ratio (exclusive).
        verbose: number of sample lines already printed; at most 10 are
            printed in total across calls.

    Returns:
        ``(code_heavy, total, verbose)`` - matching documents, documents seen,
        and the updated sample-print counter.
    """
    code_heavy = 0
    total = 0
    for doc in docs:
        total += 1
        txt = getText(doc)
        if not txt:
            # Empty text cannot be code-heavy, and dividing by len(txt) == 0
            # below would raise ZeroDivisionError.
            continue
        codes = ' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose < 10:
            print(len(codes), len(txt))
            verbose += 1
        if len(codes) / len(txt) > threshold:
            code_heavy += 1
    return code_heavy, total, verbose


def test3():
    """Print how many questions and answers in ``stackoverflow`` are code-heavy.

    A post counts as code-heavy when its joined code snippets are longer than
    20% of its text (title + body for questions, body for answers). The first
    10 posts that contain any code also have their code/text lengths printed;
    that budget is shared between the question and the answer pass.

    The denominator in each report is the number of documents iterated, which
    avoids a separate ``Collection.count()`` round trip.
    """
    from programmingalpha.DataSet.DBLoader import MongoStackExchange
    from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent

    processor = PreprocessPostContent()
    threshold = 0.2
    verbose = 0

    db = MongoStackExchange(host='10.1.1.9', port=50000)
    try:
        db.useDB('stackoverflow')

        count, total, verbose = _countCodeHeavy(
            db.questions.find().batch_size(10000),
            lambda q: q['Title'] + q['Body'],
            processor, threshold, verbose,
        )
        print("code question is {}/{}".format(count, total))

        count, total, verbose = _countCodeHeavy(
            db.answers.find().batch_size(10000),
            lambda ans: ans['Body'],
            processor, threshold, verbose,
        )
        print('code answer is {}/{}'.format(count, total))
    finally:
        # The original never released the connection.
        db.close()
def init(tokenizer_class):
    """Create the tokenizer and database handle stored in module globals.

    Instantiates ``tokenizer_class`` into ``PROCESS_TOK`` and opens a
    ``MongoStackExchange`` connection into ``PROCESS_DB``, registering a
    ``Finalize`` callback (exit priority 100) for each so that
    ``shutdown`` / ``close`` run on teardown. Presumably meant as a
    worker-process initializer - confirm against the caller.

    ``dbName`` is not a parameter; it is read from the enclosing module scope.

    Args:
        tokenizer_class: zero-argument callable returning an object with a
            ``shutdown`` method.
    """
    global PROCESS_TOK, PROCESS_DB

    tok = tokenizer_class()
    PROCESS_TOK = tok
    Finalize(tok, tok.shutdown, exitpriority=100)

    # NOTE(review): the port is passed as the string '36666' here - confirm
    # that MongoStackExchange accepts a string port (pymongo's MongoClient
    # itself requires an int).
    conn = MongoStackExchange(host='10.1.1.9', port='36666')
    PROCESS_DB = conn
    conn.useDB(dbName)
    Finalize(conn, conn.close, exitpriority=100)
def buildGraph(dbName, host='10.1.1.9', port=50000):
    """Build a weighted networkx graph from ``postlinks`` and split it by component.

    Every link document adds the edge ``PostId - RelatedPostId``.
    ``LinkTypeId`` 3 is stored as weight 0 and ``LinkTypeId`` 1 as weight 1.
    When the same pair is linked more than once, the link read last decides
    the weight.

    The cursor batch size is read from the module-level ``args.batch_size``.

    Args:
        dbName: database name handed to ``MongoStackExchange.useDB``.
        host: MongoDB host; defaults to the previously hard-coded server.
        port: MongoDB port; defaults to the previously hard-coded port.

    Returns:
        ``(graphs, G)`` where ``G`` is the full graph. With fewer than 1e4
        nodes ``graphs`` is ``[G]``; otherwise it is the list of
        connected-component subgraph views of ``G``, largest first.

    Raises:
        ValueError: a link has a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host=host, port=port)
    try:
        db.useDB(dbName)
        links = db.stackdb.get_collection("postlinks")
        # Materialised up front so tqdm knows the total and the count can be
        # logged; it also means the connection is no longer needed afterwards.
        allLinks = list(links.find().batch_size(args.batch_size))
    finally:
        # The original never released the connection.
        db.close()

    G = nx.Graph()
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))
        # The hand-rolled adjacency dict built alongside G was never read or
        # returned, so only the networkx graph is maintained.
        G.add_edge(id_a, id_b, weight=w)

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    # number_of_edges()/number_of_nodes() avoid copying the edge and node
    # views into lists just to count them.
    logger.info("graph size of edges({}) and nodes({})".format(
        G.number_of_edges(), G.number_of_nodes()))

    if G.number_of_nodes() < 1e+4:
        return [G], G

    logger.info("cutting graph into small blocks")
    graphs = [G.subgraph(cc) for cc in nx.connected_components(G)]
    graphs.sort(key=lambda g: len(g.nodes), reverse=True)
    logger.info("num of subGs:{}".format(len(graphs)))

    subnodes = [len(g.nodes) for g in graphs[:10]]
    logger.info("nodes of subG(top10):{}".format(subnodes))
    return graphs, G
def genResults():
    """Pair each query's reference answer with its generated summary and dump to JSON.

    Reads query ids from ``../../dataCases/query_list.txt`` and, for the first
    100 of them (fewer if the list is shorter), the generated summary from
    ``../../dataCases/Summary_list/<index>.txt``. For every query with a
    summary, the reference answer is the accepted answer when the question has
    one that can be found, otherwise the highest-scored answer. The answer
    body is reduced to plain text and the pair is written to
    ``../../dataCases/answers.json`` as
    ``{qid: {"true": ..., "generated": ...}}``.

    Queries with no summary, no question document, or no answer are reported
    on stdout and skipped.
    """
    import json

    Qids = readQueryId("../../dataCases/query_list.txt")

    # Summary files are numbered by position in the query list; only the
    # first 100 exist. Bounding by len(Qids) avoids an IndexError on a
    # shorter list.
    Summy = {}
    for i in range(min(100, len(Qids))):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)

    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()

    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    try:
        docDB.useDB("stackoverflow")

        for qid in Qids:
            if qid not in Summy:
                # Ids beyond the first 100 have no summary; looking them up
                # used to raise KeyError after the database work was done.
                print("No summary", qid)
                continue

            question = docDB.questions.find_one({"Id": qid})
            if not question:
                print("None Error", qid, question)
                continue

            ans = None
            accepted_id = question.get("AcceptedAnswerId")
            if accepted_id:
                accepted = docDB.answers.find_one({"Id": accepted_id})
                # The accepted answer may be absent from the collection; fall
                # through to the best-scored answer instead of failing on
                # None["Body"].
                if accepted:
                    ans = accepted["Body"]

            if ans is None:
                # max() keeps the first of equally scored answers, the same
                # pick as a stable descending sort followed by [0].
                best = max(
                    docDB.answers.find({"ParentId": qid}),
                    key=lambda x: x["Score"],
                    default=None,
                )
                if best is None:
                    print("Error!", qid)
                    continue
                ans = best["Body"]

            ans = processor.getPlainTxt(ans)
            ans = " ".join(ans)

            Answers[qid] = {"true": ans, "generated": Summy[qid]}
            print(len(Answers), Answers[qid])
    finally:
        # The original never released the connection.
        docDB.close()

    with open("../../dataCases/answers.json", "w") as f:
        json.dump(Answers, f)
# NOTE(review): the statements on the next line are the tail of a definition
# that starts outside this view; they are kept verbatim because their
# enclosing scope cannot be recovered from here.
questionsDataGlobal, ansIdxGlobal = fetchQuestionData(needed_qids) answersDataGlobal = fetchAnswerData(ansIdxGlobal, questionsDataGlobal.keys()) questionsDataGlobal.update(unsolvedQuestionGlobal) generateContextAnswerCorpusParallel(distance_dataNew, questionsDataGlobal, answersDataGlobal)

# Script entry point: parse the command line, open the database, run main().
if __name__ == '__main__':
    # Command-line options; each default applies when the flag is omitted.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--samples', type=int, default=2000)
    parser.add_argument('--db', type=str, default="stackoverflow")
    parser.add_argument('--contextLen', type=int, default=1200)
    parser.add_argument('--questionLen', type=int, default=150)
    parser.add_argument('--lose_rate', type=float, default=0.5)
    parser.add_argument('--extractor', type=str, default="lexrankS")
    parser.add_argument('--workers', type=int, default=32)
    args = parser.parse_args()

    # args, docDB and dbName are bound at module scope before main() runs;
    # presumably the functions in this file read them as globals - confirm.
    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = args.db
    docDB.useDB(dbName)

    main()
def initDB(dbName, host='10.1.1.9', port=50000):
    """Open a ``MongoStackExchange`` connection and select *dbName* on it.

    Args:
        dbName: database name handed to ``useDB``.
        host: MongoDB host; defaults to the previously hard-coded server.
        port: MongoDB port; defaults to the previously hard-coded port.

    Returns:
        The connected ``MongoStackExchange`` instance. The caller owns it and
        is responsible for releasing it.
    """
    db = MongoStackExchange(host=host, port=port)
    db.useDB(dbName)
    return db
# NOTE(review): the statements on the next line are the tail of a definition
# that starts outside this view; they are kept verbatim because their
# enclosing scope cannot be recovered from here.
with open(seq2seq_sample_file_dst,"w") as f: f.writelines(dataDst)

# Script entry point: parse the command line, open the database, then run the
# generator selected by --task.
if __name__ == '__main__':
    # Command-line options; each default applies when the flag is omitted.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)
    args = parser.parse_args()

    # args and docDB are bound at module scope before any generator runs.
    docDB=MongoStackExchange(host='10.1.1.9',port=50000)
    docDB.useDB(args.db)

    # Three independent checks: a --task value that matches none of them runs
    # nothing and the script exits without a message.
    if args.task=="inference":
        logger.info("task is "+args.task)
        inferenceGen()
    if args.task=="seq2seq":
        logger.info("task is "+args.task)
        seq2seqGen()
    if args.task=="knowNet":
        logger.info("task is "+args.task)
        knowNetGen()
# NOTE(review): the statements on the next line are the tail of a definition
# that starts outside this view; they are kept verbatim because their nesting
# (which statements sit inside the loop or the batch-size check) cannot be
# recovered from here.
if record is not None: cache.append(json.dumps(record) + "\n") if len(cache) > args.batch_size: f.writelines(cache) cache.clear() f.flush() if len(cache) > 0: f.writelines(cache) cache.clear()

# Script entry point: parse the command line, open the "posts" database, run
# processData().
if __name__ == '__main__':
    # Command-line options; each default applies when the flag is omitted.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=10)
    parser.add_argument('--relative_num', type=int, default=5)
    parser.add_argument('--target_answer_len', type=int, default=100)
    parser.add_argument('--answer_len', type=int, default=80)
    parser.add_argument('--question_len', type=int, default=80)
    args = parser.parse_args()

    # args and docDB are bound at module scope before processData() runs.
    # The database name is fixed here; there is no --db option in this script.
    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")

    processData()