def test3():
    """Report how many StackOverflow questions and answers are code-heavy.

    A post counts as code-heavy when the characters of its extracted code
    snippets (joined with single spaces) exceed ``threshold`` (0.2) of the
    characters of the post text.  Question text is ``Title + Body``; answer
    text is ``Body``.  For the first ten posts (across questions and answers
    together) that contain any code, the code length and text length are
    printed as a sample.
    """
    from programmingalpha.DataSet.DBLoader import MongoStackExchange
    from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent

    processor = PreprocessPostContent()
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB('stackoverflow')

    threshold = 0.2
    max_samples = 10
    shown = 0  # sample lines printed so far; shared by both passes

    def tally(texts):
        """Return ``(code_heavy, total)`` for an iterable of post texts."""
        nonlocal shown
        heavy = 0
        total = 0
        for txt in texts:
            total += 1
            codes = ' '.join(processor.getCodeSnippets(txt))
            if codes and shown < max_samples:
                print(len(codes), len(txt))
                shown += 1
            # An empty post has no meaningful ratio; skipping it avoids a
            # ZeroDivisionError that would abort the whole scan.
            if txt and len(codes) / len(txt) > threshold:
                heavy += 1
        return heavy, total

    # Totals are tallied while scanning instead of issuing a separate
    # collection-level count() call (removed from recent PyMongo releases).
    questions = db.questions.find().batch_size(10000)
    count, total = tally(q['Title'] + q['Body'] for q in questions)
    print("code question is {}/{}".format(count, total))

    answers = db.answers.find().batch_size(10000)
    count, total = tally(ans['Body'] for ans in answers)
    print('code answer is {}/{}'.format(count, total))
def buildGraph(dbName):
    """Build a symmetric weighted adjacency map from the ``postlinks`` collection.

    Every link document contributes one undirected edge between ``PostId``
    and ``RelatedPostId``.  ``LinkTypeId`` 3 maps to weight 0 and
    ``LinkTypeId`` 1 maps to weight 1.

    Args:
        dbName: name of the database to select on the Mongo connection.

    Returns:
        dict mapping each post id to ``{neighbour_id: weight}``.

    Raises:
        ValueError: if a link has a ``LinkTypeId`` other than 1 or 3.
    """
    store = MongoStackExchange(host='10.1.1.9', port=50000)
    store.useDB(dbName)

    cursor = store.stackdb.get_collection("postlinks").find()
    records = list(cursor.batch_size(args.batch_size))

    adjacency = {}
    for record in tqdm.tqdm(records, desc="building graph from links"):
        src, dst = record["PostId"], record["RelatedPostId"]
        kind = record["LinkTypeId"]
        if kind not in (3, 1):
            raise ValueError("unexpected value {} for link type".format(kind))
        weight = 0 if kind == 3 else 1

        # Store the edge in both directions so lookups work from either end.
        adjacency.setdefault(src, {})[dst] = weight
        adjacency.setdefault(dst, {})[src] = weight

    logger.info("finished finding {} sublinks".format(len(records)))
    return adjacency
def init(tokenizer_class):
    """Create the per-process tokenizer and database handle.

    Instantiates ``tokenizer_class`` and a ``MongoStackExchange`` connection,
    publishes them through the module globals ``PROCESS_TOK`` and
    ``PROCESS_DB``, and registers ``Finalize`` callbacks (exit priority 100)
    that call ``shutdown`` on the tokenizer and ``close`` on the database.

    Args:
        tokenizer_class: zero-argument callable returning an object that
            exposes a ``shutdown`` method.
    """
    global PROCESS_TOK, PROCESS_DB

    tokenizer = tokenizer_class()
    PROCESS_TOK = tokenizer
    Finalize(tokenizer, tokenizer.shutdown, exitpriority=100)

    # NOTE(review): the port is passed as a string here -- confirm that
    # MongoStackExchange accepts a non-integer port.
    database = MongoStackExchange(host='10.1.1.9', port='36666')
    PROCESS_DB = database
    database.useDB(dbName)
    Finalize(database, database.close, exitpriority=100)
def buildGraph(dbName):
    """Build the post-link graph and, when it is large, its components.

    Every document of the ``postlinks`` collection becomes one undirected
    edge between ``PostId`` and ``RelatedPostId`` with attribute ``weight``:
    0 for ``LinkTypeId`` 3 and 1 for ``LinkTypeId`` 1.

    Args:
        dbName: name of the database to select on the Mongo connection.

    Returns:
        A pair ``(graphs, G)``.  ``G`` is the full ``nx.Graph``.  When ``G``
        has fewer than 1e4 nodes, ``graphs`` is ``[G]``; otherwise it is the
        list of connected-component subgraphs of ``G`` sorted by node count,
        largest first.

    Raises:
        ValueError: if a link has a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")
    allLinks = list(links.find().batch_size(args.batch_size))

    G = nx.Graph()
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))
        G.add_edge(id_a, id_b, weight=w)

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    # The edge and node views report their size directly, so there is no
    # need to copy them into temporary lists first.
    logger.info("graph size of edges({}) and nodes({})".format(len(G.edges), len(G.nodes)))

    if len(G.nodes) < 1e+4:
        return [G], G

    logger.info("cutting graph into small blocks")
    graphs = [G.subgraph(cc) for cc in nx.connected_components(G)]
    graphs.sort(key=lambda g: len(g.nodes), reverse=True)
    logger.info("num of subGs:{}".format(len(graphs)))

    subnodes = [len(g.nodes) for g in graphs[:10]]
    logger.info("nodes of subG(top10):{}".format(subnodes))
    return graphs, G
def genResults():
    """Pair reference answers with generated summaries and dump them as JSON.

    Reads the ids from ``query_list.txt`` and the summary files
    ``Summary_list/<i>.txt`` (at most the first 100, matched to ids by
    position).  For every id it looks up the question in the
    ``stackoverflow`` database and picks the accepted answer when present,
    otherwise the highest-scored answer.  The answer is converted with
    ``PreprocessPostContent.getPlainTxt``, joined with spaces, and stored as
    ``{"true": <answer text>, "generated": <summary>}``.  The resulting
    mapping is written to ``../../dataCases/answers.json``.

    Ids with no summary, no question document, or no answer are reported on
    stdout and skipped.
    """
    import json

    maxSummaries = 100  # summary files are named 0.txt .. 99.txt

    Qids = readQueryId("../../dataCases/query_list.txt")
    Summy = {}
    # Bound by len(Qids) so a short id list does not raise IndexError.
    for i in range(min(maxSummaries, len(Qids))):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)
    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")

    for qid in Qids:
        # Check first: without a summary the record cannot be built, and
        # failing late would waste the database round trips below.
        if qid not in Summy:
            print("No summary", qid)
            continue

        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue

        acceptedId = question.get("AcceptedAnswerId")
        accepted = docDB.answers.find_one({"Id": acceptedId}) if acceptedId else None
        if accepted:
            ans = accepted["Body"]
        else:
            # No accepted answer recorded, or its document is missing from
            # the answers collection: fall back to the best-scored answer.
            answers = list(docDB.answers.find({"ParentId": qid}))
            if not answers:
                print("Error!", qid)
                continue
            # max() returns the first maximal element, the same one a stable
            # descending sort would place at index 0.
            ans = max(answers, key=lambda x: x["Score"])["Body"]

        ans = processor.getPlainTxt(ans)
        ans = " ".join(ans)
        Answers[qid] = {"true": ans, "generated": Summy[qid]}
        print(len(Answers), Answers[qid])

    with open("../../dataCases/answers.json", "w") as f:
        json.dump(Answers, f)
questionsDataGlobal, ansIdxGlobal = fetchQuestionData(needed_qids) answersDataGlobal = fetchAnswerData(ansIdxGlobal, questionsDataGlobal.keys()) questionsDataGlobal.update(unsolvedQuestionGlobal) generateContextAnswerCorpusParallel(distance_dataNew, questionsDataGlobal, answersDataGlobal) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--batch_size', type=int, default=100) parser.add_argument('--samples', type=int, default=2000) parser.add_argument('--db', type=str, default="stackoverflow") parser.add_argument('--contextLen', type=int, default=1200) parser.add_argument('--questionLen', type=int, default=150) parser.add_argument('--lose_rate', type=float, default=0.5) parser.add_argument('--extractor', type=str, default="lexrankS") parser.add_argument('--workers', type=int, default=32) args = parser.parse_args() docDB = MongoStackExchange(host='10.1.1.9', port=50000) dbName = args.db docDB.useDB(dbName) main()
"TagSeeds.json")["Tags"] if not m_tag: m_tag = [ "<keras>", "<tensorflow>", "<caffe>", "<pytorch>", "<artificial-intelligence>", "<nlp>", "<computer-vision>", "<deep-learning>", "<neural-network>", "<machine-learning>", "<reinforcement-learning>", "<scikit-learn>" ] print("search init with %d tag seeds" % len(m_tag)) minSuppport = 1000 tuneMaxClipNum = None mongodb = MongoStackExchange(args.mongodb) dbname = "stackoverflow" tagCounter = TagCounter(mongodb, dbname) tagCounter.ItemSeeds.update(m_tag) #mine fp tree frequentItems = getFrequentItems() print("get %d frequent patterns" % len(frequentItems), "below are the frequent patterns of seeds") ''' seedCounter=tagCounter.getTagCounter(m_tag) for i in range(len(m_tag)): for j in range(1,len(m_tag)): tag1,tag2=m_tag[i],m_tag[j] tagP=frozenset({tag1,tag2})
from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent import json if __name__ == '__main__': from programmingalpha.DataSet.DBLoader import MongoStackExchange db = MongoStackExchange("mongodb://10.1.1.9") AIQA = db.stackdb["QAPForAI"] data = [] with open("testdata/quetions.txt", "w") as f: for x in AIQA.find().batch_size(100): txt = x["question_title"] + " " + x["question_body"] processTxt = PreprocessPostContent() processTxt.raw_txt = txt result1, result2, result3 = processTxt.getEmCodes( ), processTxt.getCodeSnippets(), processTxt.getPlainTxt() data.append( json.dumps({ "emcodes": result1, "snippets": result2, "plaintxt": result3 }) + "\n") if len(data) % 1000 == 0: f.writelines(data) data.clear() if len(data) > 0:
def initDB(dbName, host='10.1.1.9', port=50000):
    """Open a ``MongoStackExchange`` connection and select ``dbName``.

    Args:
        dbName: name of the database passed to ``useDB``.
        host: server host; defaults to the address previously hard-coded
            in this function.
        port: server port; defaults to the port previously hard-coded in
            this function.

    Returns:
        The ``MongoStackExchange`` instance with ``dbName`` selected.
    """
    db = MongoStackExchange(host=host, port=port)
    db.useDB(dbName)
    return db
with open(seq2seq_sample_file_dst,"w") as f: f.writelines(dataDst) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--batch_size', type=int, default=1000) parser.add_argument('--db', type=str, default="corpus") parser.add_argument('--maxSize', type=int, default=-1) parser.add_argument('--task', type=str, default="seq2seq") parser.add_argument('--contextLen', type=int, default=312) parser.add_argument('--questionLen', type=int, default=100) parser.add_argument('--answerLen', type=int, default=100) args = parser.parse_args() docDB=MongoStackExchange(host='10.1.1.9',port=50000) docDB.useDB(args.db) if args.task=="inference": logger.info("task is "+args.task) inferenceGen() if args.task=="seq2seq": logger.info("task is "+args.task) seq2seqGen() if args.task=="knowNet": logger.info("task is "+args.task) knowNetGen()
if record is not None: cache.append(json.dumps(record) + "\n") if len(cache) > args.batch_size: f.writelines(cache) cache.clear() f.flush() if len(cache) > 0: f.writelines(cache) cache.clear() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--batch_size', type=int, default=10) parser.add_argument('--relative_num', type=int, default=5) parser.add_argument('--target_answer_len', type=int, default=100) parser.add_argument('--answer_len', type=int, default=80) parser.add_argument('--question_len', type=int, default=80) args = parser.parse_args() docDB = MongoStackExchange(host='10.1.1.9', port=50000) docDB.useDB("posts") processData()
from programmingalpha.DataSet.DBLoader import MongoStackExchange from tqdm import tqdm import pickle import numpy as np from matplotlib import pyplot docDB=MongoStackExchange(host='10.1.1.9',port=50000) def countAnswerLen(): docDB.useDB('corpus') seq2seq=docDB.stackdb['seq2seq'] answerLength=[] retrictedLen=[] for record in tqdm(seq2seq.find().batch_size(10000),desc="retrieving seq2seq record"): ansL=len(" ".join(record["answer"]).split()) answerLength.append(ansL) if ansL<=200: retrictedLen.append(ansL) answerLength.sort() avg=np.mean(answerLength) std=np.std(answerLength) hist=np.histogram(answerLength) print(avg,std,len(answerLength),len(retrictedLen)) print(hist) x=np.arange(len(answerLength))