コード例 #1
0
def buildGraph(dbName):
    """Build an undirected adjacency map from the ``postlinks`` collection.

    Connects to the MongoDB server, selects ``dbName`` and loads every
    document of the ``postlinks`` collection into memory (the cursor batch
    size comes from the module-level ``args.batch_size``).

    Each link contributes an edge in both directions. The edge weight is
    derived from ``LinkTypeId``: ``3`` maps to weight ``0`` and ``1`` maps to
    weight ``1`` (presumably Stack Exchange's "duplicate" and "linked" types
    -- TODO confirm against the data dump schema).

    Returns:
        dict mapping a post id to ``{neighbour id: weight}``.

    Raises:
        ValueError: if a link carries any other ``LinkTypeId``.
    """
    store = MongoStackExchange(host='10.1.1.9', port=50000)
    store.useDB(dbName)

    cursor = store.stackdb.get_collection("postlinks").find()
    records = list(cursor.batch_size(args.batch_size))

    adjacency = {}
    for record in tqdm.tqdm(records, desc="building graph from links"):
        src, dst = record["PostId"], record["RelatedPostId"]
        link_type = record["LinkTypeId"]

        if link_type == 3:
            weight = 0
        elif link_type == 1:
            weight = 1
        else:
            raise ValueError("unexpected value {} for link type".format(link_type))

        # Store the edge symmetrically; a later link between the same pair
        # overwrites the earlier weight.
        adjacency.setdefault(src, {})[dst] = weight
        adjacency.setdefault(dst, {})[src] = weight

    logger.info("finished finding {} sublinks".format(len(records)))

    return adjacency
コード例 #2
0
def test3():
    """Print how many questions and answers are dominated by code.

    A post is counted when the space-joined code snippets returned by
    ``PreprocessPostContent.getCodeSnippets`` are longer than ``threshold``
    (0.2) times the length of the post text. Question text is
    ``Title + Body``; answer text is ``Body``.

    While scanning, the first posts that contain any code have their
    ``(len(codes), len(txt))`` pair printed; at most 10 such samples are
    printed in total across both passes.

    The reported denominator is the number of documents actually iterated,
    which avoids a second round-trip to the server and does not rely on
    ``Collection.count()`` (removed in PyMongo 4 -- assuming
    ``db.questions`` / ``db.answers`` are PyMongo collections; confirm).
    """
    from programmingalpha.DataSet.DBLoader import MongoStackExchange
    from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent

    processor = PreprocessPostContent()
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB('stackoverflow')

    threshold = 0.2
    samples_left = 10  # shared print budget for both passes

    question_texts = (
        q['Title'] + q['Body'] for q in db.questions.find().batch_size(10000)
    )
    count, total, samples_left = _count_code_heavy(
        question_texts, processor, threshold, samples_left)
    print("code question is {}/{}".format(count, total))

    answer_texts = (
        ans['Body'] for ans in db.answers.find().batch_size(10000)
    )
    count, total, samples_left = _count_code_heavy(
        answer_texts, processor, threshold, samples_left)
    print('code answer is {}/{}'.format(count, total))


def _count_code_heavy(texts, processor, threshold, samples_left):
    """Count texts whose code-to-text length ratio exceeds ``threshold``.

    Args:
        texts: iterable of post texts (strings).
        processor: object exposing ``getCodeSnippets(text)``.
        threshold: minimum ``len(codes) / len(text)`` ratio, exclusive.
        samples_left: how many ``(len(codes), len(text))`` samples may still
            be printed.

    Returns:
        ``(heavy, total, samples_left)``: number of code-heavy texts, number
        of texts seen, and the remaining print budget.
    """
    heavy = 0
    total = 0
    for txt in texts:
        total += 1
        if not txt:
            # An empty post cannot be code-heavy, and dividing by its length
            # would raise ZeroDivisionError.
            continue

        codes = ' '.join(processor.getCodeSnippets(txt))
        if codes and samples_left > 0:
            print(len(codes), len(txt))
            samples_left -= 1

        if len(codes) / len(txt) > threshold:
            heavy += 1
    return heavy, total, samples_left
コード例 #3
0
def init(tokenizer_class):
    """Create the process-wide tokenizer and database handles.

    Sets the module globals ``PROCESS_TOK`` and ``PROCESS_DB`` and registers
    a ``Finalize`` callback for each so that ``shutdown()`` / ``close()`` is
    invoked on cleanup. Presumably used as a worker-pool initializer, with
    ``Finalize`` being ``multiprocessing.util.Finalize`` -- confirm against
    the file's imports.

    Args:
        tokenizer_class: zero-argument callable returning a tokenizer that
            exposes ``shutdown()``.

    Note:
        Reads the module-level ``dbName`` to select the database.
    """
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    # Pass the port as an int, as every other MongoStackExchange call site
    # in this project does. PyMongo's MongoClient rejects non-int ports
    # (assuming the wrapper forwards the value unchanged -- confirm).
    PROCESS_DB = MongoStackExchange(host='10.1.1.9', port=36666)
    PROCESS_DB.useDB(dbName)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
コード例 #4
0
def buildGraph(dbName):
    """Build a weighted undirected graph from the ``postlinks`` collection.

    Connects to the MongoDB server, selects ``dbName`` and loads every
    document of ``postlinks`` into memory (cursor batch size comes from the
    module-level ``args.batch_size``). Each link becomes an edge
    ``PostId -- RelatedPostId`` whose ``weight`` is ``0`` for
    ``LinkTypeId == 3`` and ``1`` for ``LinkTypeId == 1``.

    Returns:
        ``(graphs, G)`` where ``G`` is the full ``networkx.Graph``. When
        ``G`` has fewer than 10,000 nodes ``graphs`` is ``[G]``; otherwise it
        is the list of connected-component subgraphs of ``G`` sorted by node
        count, largest first.

    Raises:
        ValueError: if a link carries a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")
    allLinks = list(links.find().batch_size(args.batch_size))

    G = nx.Graph()
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        # The graph itself is the only structure needed; the parallel
        # adjacency dict the function used to build was never read.
        G.add_edge(id_a, id_b, weight=w)

    num_nodes = G.number_of_nodes()
    logger.info("finished finding {} sublinks".format(len(allLinks)))
    logger.info("graph size of edges({}) and nodes({})".format(
        G.number_of_edges(), num_nodes))

    # Small graphs are returned whole.
    if num_nodes < 1e+4:
        return [G], G

    logger.info("cutting graph into small blocks")

    graphs = [G.subgraph(cc) for cc in nx.connected_components(G)]
    graphs.sort(key=lambda g: len(g.nodes), reverse=True)

    logger.info("num of subGs:{}".format(len(graphs)))
    subnodes = [len(g.nodes) for g in graphs[:10]]
    logger.info("nodes of subG(top10):{}".format(subnodes))

    return graphs, G
コード例 #5
0
def genResults():
    """Pair each query's reference answer with its generated summary.

    Reads question ids from ``../../dataCases/query_list.txt`` and, for the
    first (at most) 100 of them, the generated summary stored in
    ``../../dataCases/Summary_list/<index>.txt``. For every question id the
    reference answer is the accepted answer's body or, when there is no
    usable accepted answer, the body of the highest-scored answer. The body
    is converted with ``PreprocessPostContent.getPlainTxt`` and joined with
    spaces.

    The mapping ``{qid: {"true": ..., "generated": ...}}`` is written to
    ``../../dataCases/answers.json``.

    Question ids that have no summary, are missing from the database, or
    have no answers are reported on stdout and skipped.
    """
    import json

    Qids = readQueryId("../../dataCases/query_list.txt")

    # Summaries are stored by position in the query list. Cap the range at
    # the number of ids so a shorter list does not raise IndexError.
    Summy = {
        Qids[i]: readSummary("../../dataCases/Summary_list/%d.txt" % i)
        for i in range(min(100, len(Qids)))
    }

    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")
    for qid in Qids:
        if qid not in Summy:
            # Ids beyond the summarised range used to abort the run with
            # KeyError after the database work had already been done.
            print("No summary", qid)
            continue

        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue

        accepted_id = question.get("AcceptedAnswerId")
        accepted = (
            docDB.answers.find_one({"Id": accepted_id}) if accepted_id else None
        )
        if accepted:
            ans = accepted["Body"]
        else:
            # No accepted answer, or it is absent from the database: fall
            # back to the best-scored answer instead of subscripting None.
            answers = list(docDB.answers.find({"ParentId": qid}))
            if not answers:
                print("Error!", qid)
                continue
            ans = max(answers, key=lambda x: x["Score"])["Body"]

        ans = " ".join(processor.getPlainTxt(ans))
        Answers[qid] = {"true": ans, "generated": Summy[qid]}
        print(len(Answers), Answers[qid])

    with open("../../dataCases/answers.json", "w") as f:
        json.dump(Answers, f)
コード例 #6
0
    questionsDataGlobal, ansIdxGlobal = fetchQuestionData(needed_qids)
    answersDataGlobal = fetchAnswerData(ansIdxGlobal,
                                        questionsDataGlobal.keys())

    questionsDataGlobal.update(unsolvedQuestionGlobal)
    generateContextAnswerCorpusParallel(distance_dataNew, questionsDataGlobal,
                                        answersDataGlobal)


if __name__ == '__main__':
    # Script entry point: parse the command line, open the document
    # database named by --db, then run main(). ``args``, ``dbName`` and
    # ``docDB`` are left as module globals.
    parser = argparse.ArgumentParser()
    for _flag, _type, _default in (
        ('--batch_size', int, 100),
        ('--samples', int, 2000),
        ('--db', str, "stackoverflow"),
        ('--contextLen', int, 1200),
        ('--questionLen', int, 150),
        ('--lose_rate', float, 0.5),
        ('--extractor', str, "lexrankS"),
        ('--workers', int, 32),
    ):
        parser.add_argument(_flag, type=_type, default=_default)

    args = parser.parse_args()

    dbName = args.db
    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB(dbName)

    main()
コード例 #7
0
def initDB(dbName):
    """Open a ``MongoStackExchange`` connection and select ``dbName``.

    Args:
        dbName: name of the database passed to ``useDB``.

    Returns:
        The ``MongoStackExchange`` instance after ``useDB(dbName)`` was
        called on it.
    """
    connection = MongoStackExchange(host='10.1.1.9', port=50000)
    connection.useDB(dbName)
    return connection
コード例 #8
0
    with open(seq2seq_sample_file_dst,"w") as f:
        f.writelines(dataDst)


if __name__ == '__main__':
    # Script entry point: parse the command line, open the database named
    # by --db and run the generator selected by --task. ``args`` and
    # ``docDB`` are left as module globals.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB(args.db)

    # Exactly one task runs per invocation.
    if args.task == "inference":
        logger.info("task is "+args.task)
        inferenceGen()
    elif args.task == "seq2seq":
        logger.info("task is "+args.task)
        seq2seqGen()
    elif args.task == "knowNet":
        logger.info("task is "+args.task)
        knowNetGen()
    else:
        # A misspelled --task used to exit silently without producing
        # anything; make that visible while still exiting normally.
        logger.warning("unknown task {!r}; nothing was generated".format(args.task))
コード例 #9
0
            if record is not None:
                cache.append(json.dumps(record) + "\n")

            if len(cache) > args.batch_size:
                f.writelines(cache)
                cache.clear()
                f.flush()

        if len(cache) > 0:
            f.writelines(cache)
            cache.clear()


if __name__ == '__main__':
    # Script entry point: parse the command line, open the "posts"
    # database and run processData(). ``args`` and ``docDB`` are left as
    # module globals.
    parser = argparse.ArgumentParser()
    for _flag, _default in (
        ('--batch_size', 10),
        ('--relative_num', 5),
        ('--target_answer_len', 100),
        ('--answer_len', 80),
        ('--question_len', 80),
    ):
        # Every option of this script is an integer.
        parser.add_argument(_flag, type=int, default=_default)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")

    processData()