def buildGraph(dbName):
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")

    allLinks = list(links.find().batch_size(args.batch_size))

    myG = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        if id_a in myG:
            myG[id_a][id_b] = w
        else:
            myG[id_a] = {id_b: w}

        if id_b in myG:
            myG[id_b][id_a] = w
        else:
            myG[id_b] = {id_a: w}

    logger.info("finished finding {} sublinks".format(len(allLinks)))

    return myG
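In the Stack Exchange data dump, LinkTypeId 3 is the duplicate link type and 1 the related link type, so the dict built above stores weight 0 between duplicate posts and weight 1 between related posts. A minimal sketch of walking that adjacency dict; the helper name and the duplicate-only filter are illustrative, not part of the original module:

def duplicateCluster(myG, start_id):
    # collect every post reachable from start_id through duplicate (weight 0) links
    seen, stack = {start_id}, [start_id]
    while stack:
        node = stack.pop()
        for nbr, w in myG.get(node, {}).items():
            if w == 0 and nbr not in seen:
                seen.add(nbr)
                stack.append(nbr)
    return seen

# e.g. cluster = duplicateCluster(buildGraph("stackoverflow"), some_post_id)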
Example #2
def test3():
    from post_rec.DataSet.DBLoader import MongoStackExchange
    from post_rec.Utility.TextPreprocessing import PreprocessPostContent
    processor = PreprocessPostContent()
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = 'stackoverflow'
    db.useDB(dbName)
    count = 0
    threshold = 0.2
    verbose = 0
    for q in db.questions.find().batch_size(10000):
        txt = q['Title'] + q['Body']
        codes = ' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose < 10:
            print(len(codes), len(txt))
            verbose += 1

        if len(codes) / len(txt) > threshold:
            count += 1
    print("code question is {}/{}".format(count, db.questions.count()))

    count = 0
    for ans in db.answers.find().batch_size(10000):
        txt = ans['Body']
        codes = ' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose < 10:
            print(len(codes), len(txt))
            verbose += 1

        if len(codes) / len(txt) > threshold:
            count += 1
    print('code answer is {}/{}'.format(count, db.answers.count()))
Example #3
def init(tokenizer_class):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = MongoStackExchange(host='10.1.1.9', port='36666')
    PROCESS_DB.useDB(dbName)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
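init above is shaped like a multiprocessing worker initializer: each worker process gets its own tokenizer and Mongo connection as module globals, with cleanup registered through Finalize (from multiprocessing.util), and it relies on a module-level dbName. A hedged sketch of how such an initializer is typically wired into a Pool; the tokenizer class, its tokenize() call, the worker function, and question_ids are placeholders, not from the original code:

from multiprocessing import Pool

def tokenize_question(q_id):
    # PROCESS_TOK / PROCESS_DB are the per-worker globals set up by init()
    doc = PROCESS_DB.questions.find_one({"Id": q_id})
    return PROCESS_TOK.tokenize(doc["Body"]) if doc else None  # tokenize() is assumed

pool = Pool(processes=8, initializer=init, initargs=(SomeTokenizer,))
tokens = pool.map(tokenize_question, question_ids)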
Example #4
def distinctMoveStack(src, dst):
    db = MongoStackExchange("10.1.1.9", "36666")
    db.useDB("stackoverflow")
    id_set = set()

    dst_collection = db.stackdb[dst]
    src_collection = db.stackdb[src]

    insert_cache = []
    print("count before distinct", src_collection.count())

    for doc in src_collection.find().batch_size(10000):
        if doc["Id"] in id_set:
            continue
        id_set.add(doc["Id"])
        insert_cache.append(doc)
        if len(insert_cache) % 10000 == 0:
            dst_collection.insert_many(insert_cache)
            insert_cache.clear()
            print("process {}/{}".format(dst_collection.count(),
                                         src_collection.count()))
    if len(insert_cache) > 0:
        dst_collection.insert_many(insert_cache)
        print("process {}/{}".format(dst_collection.count(),
                                     src_collection.count()))

    print("count after distinct", dst_collection.count())
Example #5
def buildGraph(dbName):
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")

    allLinks = list(links.find().batch_size(args.batch_size))

    G = nx.Graph()

    myG = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        G.add_edge(id_a, id_b, weight=w)

        if id_a in myG:
            myG[id_a][id_b] = w
        else:
            myG[id_a] = {id_b: w}

        if id_b in myG:
            myG[id_b][id_a] = w
        else:
            myG[id_b] = {id_a: w}

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    logger.info("graph size of edges({}) and nodes({})".format(
        len(list(G.edges)), len(list(G.nodes))))

    if len(G.nodes) < 1e+4:
        return [G], G
    else:
        logger.info("cutting graph into small blocks")

    graphs = []

    for cc in nx.connected_components(G):
        g = G.subgraph(cc)
        graphs.append(g)

    graphs.sort(key=lambda g: len(g.nodes), reverse=True)

    logger.info("num of subGs:{}".format(len(graphs)))
    subnodes = list(map(lambda g: len(g.nodes), graphs))[:10]
    logger.info("nodes of subG(top10):{}".format(subnodes))

    return graphs, G
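Either return shape can be consumed the same way: iterate the component list (sorted largest first) and read edge weights off the networkx graph. A small sketch; the duplicate-pair extraction is illustrative, not part of the original module:

graphs, G = buildGraph("stackoverflow")
for g in graphs[:3]:  # largest components first
    dup_pairs = [(u, v) for u, v, d in g.edges(data=True) if d["weight"] == 0]
    logger.info("component: {} nodes, {} duplicate pairs".format(len(g.nodes), len(dup_pairs)))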
Example #6
def genResults():
    Qids = readQueryId("../../dataCases/query_list.txt")
    Summy = {}
    for i in range(100):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)

    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")
    for qid in Qids:
        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue
        #print(question)
        if "AcceptedAnswerId" in question and question["AcceptedAnswerId"]:
            ans = docDB.answers.find_one({"Id": question["AcceptedAnswerId"]
                                          })["Body"]
        else:
            answers = docDB.answers.find({"ParentId": qid})
            answers = list(answers)
            if len(answers) < 1:
                print("Error!", qid)
                continue
            answers.sort(key=lambda x: x["Score"], reverse=True)
            ans = answers[0]["Body"]

        ans = processor.getPlainTxt(ans)
        ans = " ".join(ans)
        Answers[qid] = {"true": ans, "generated": Summy[qid]}
        print(len(Answers), Answers[qid])
        #break

    with open("../../dataCases/answers.json", "w") as f:
        import json
        json.dump(Answers, f)
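The dumped answers.json maps each query id to a {"true": ..., "generated": ...} pair, so downstream comparison is a plain dictionary walk; the metric function below is a placeholder, not part of the original code:

import json

with open("../../dataCases/answers.json") as f:
    answers = json.load(f)

for qid, pair in answers.items():
    score = rouge_score(pair["generated"], pair["true"])  # placeholder metric function
    print(qid, score)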
Example #7
    labelDataNew = []
    for ld in labelData:
        id1, id2 = ld["pair"]
        if id1 not in q_ids_set or id2 not in q_ids_set:
            continue
        labelDataNew.append(ld)

    labels = map(lambda ll: ll["label"], labelData)

    import collections
    logger.info(collections.Counter(labels))

    generateQuestionCorpus(labelDataNew, postData_local)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--source', type=str, default="crossvalidated")

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")

    logger.info("task source is {}".format(args.source))

    main()
Example #8
def initDB(dbName):

    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    return db
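Usage is a one-liner; the database name and the questions.count() call below simply follow the way the connection object is used in the other examples:

db = initDB("stackoverflow")
print(db.questions.count())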
Example #9
        if len(cache) > 0:
            f.writelines(cache)
            cache.clear()

def main():

    questionsDataGlobal = fetchQuestionData()
    answersDataGlobal = fetchAnswerData(questionsDataGlobal.keys())
    indexerDataGlobal = fetchIndexData(questionsDataGlobal.keys())

    generateContextAnswerCorpusParallel(questionsDataGlobal, answersDataGlobal, indexerDataGlobal)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)

    parser.add_argument('--db', type=str, default="crossvalidated")

    parser.add_argument('--workers', type=int, default=10)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = args.db
    docDB.useDB(dbName)

    logger.info("processing db data: {}".format(dbName))

    main()
Example #10
    with open(seq2seq_sample_file_dst, "w") as f:
        f.writelines(dataDst)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB(args.db)

    if args.task == "inference":
        logger.info("task is " + args.task)
        inferenceGen()
    if args.task == "seq2seq":
        logger.info("task is " + args.task)
        seq2seqGen()

    if args.task == "knowNet":
        logger.info("task is " + args.task)
        knowNetGen()