コード例 #1
0
def _countCodeHeavy(docs, getText, processor, threshold, samplesLeft):
    """Count documents whose code-snippet share of the text exceeds `threshold`.

    Args:
        docs: iterable of post documents.
        getText: callable mapping one document to the text to analyse.
        processor: object exposing ``getCodeSnippets(text)``; the snippets it
            returns are joined with single spaces before measuring.
        threshold: minimum ``len(code) / len(text)`` ratio (exclusive).
        samplesLeft: how many more ``(len(code), len(text))`` debug lines may
            still be printed.

    Returns:
        ``(count, samplesLeft)`` - the number of code-heavy documents and the
        remaining debug-print budget.
    """
    count = 0
    for doc in docs:
        txt = getText(doc)
        codes = ' '.join(processor.getCodeSnippets(txt))
        if codes and samplesLeft > 0:
            print(len(codes), len(txt))
            samplesLeft -= 1

        # A post with empty text has no meaningful ratio; skip it instead of
        # raising ZeroDivisionError.
        if txt and len(codes) / len(txt) > threshold:
            count += 1
    return count, samplesLeft


def test3():
    """Print how many questions and answers are dominated by code snippets.

    Connects to the ``stackoverflow`` database, scans every question
    (title + body) and every answer (body), and prints the number of posts
    whose extracted code makes up more than 20% of the text. At most ten
    ``(len(code), len(text))`` sample lines are printed across both scans.
    """
    from post_rec.DataSet.DBLoader import MongoStackExchange
    from post_rec.Utility.TextPreprocessing import PreprocessPostContent

    processor = PreprocessPostContent()
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = 'stackoverflow'
    db.useDB(dbName)

    threshold = 0.2
    samplesLeft = 10  # debug-print budget shared by both scans

    count, samplesLeft = _countCodeHeavy(
        db.questions.find().batch_size(10000),
        lambda q: q['Title'] + q['Body'],
        processor, threshold, samplesLeft)
    # NOTE(review): if `questions`/`answers` are PyMongo collections, `count()`
    # was removed in PyMongo 4 - confirm the driver version in use.
    print("code question is {}/{}".format(count, db.questions.count()))

    count, samplesLeft = _countCodeHeavy(
        db.answers.find().batch_size(10000),
        lambda ans: ans['Body'],
        processor, threshold, samplesLeft)
    print('code answer is {}/{}'.format(count, db.answers.count()))
コード例 #2
0
def init(tokenizer_class):
    """Create the tokenizer and database handle and publish them as globals.

    Instantiates `tokenizer_class` into ``PROCESS_TOK`` and opens a
    ``MongoStackExchange`` connection into ``PROCESS_DB`` (selecting the
    module-level ``dbName``). A ``Finalize`` callback with exit priority 100
    is registered for each so that ``shutdown`` / ``close`` is invoked on
    cleanup.
    """
    global PROCESS_TOK, PROCESS_DB

    tokenizer = tokenizer_class()
    PROCESS_TOK = tokenizer
    Finalize(tokenizer, tokenizer.shutdown, exitpriority=100)

    # NOTE(review): the port is passed as a string here while other call
    # sites pass an int - confirm MongoStackExchange accepts both.
    connection = MongoStackExchange(host='10.1.1.9', port='36666')
    PROCESS_DB = connection
    connection.useDB(dbName)
    Finalize(connection, connection.close, exitpriority=100)
コード例 #3
0
def buildGraph(dbName):
    """Build a symmetric weighted adjacency dict from the ``postlinks`` collection.

    Every link document contributes an edge between ``PostId`` and
    ``RelatedPostId`` in both directions. ``LinkTypeId`` 3 maps to weight 0
    and ``LinkTypeId`` 1 maps to weight 1.

    Args:
        dbName: name of the database to select on the MongoDB server.

    Returns:
        dict mapping each post id to ``{neighbour id: weight}``.

    Raises:
        ValueError: if a link has a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    linkCollection = db.stackdb.get_collection("postlinks")
    allLinks = list(linkCollection.find().batch_size(args.batch_size))

    adjacency = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        source, target = link["PostId"], link["RelatedPostId"]
        linkType = link["LinkTypeId"]

        if linkType == 1:
            weight = 1
        elif linkType == 3:
            weight = 0
        else:
            raise ValueError("unexpected value {} for link type".format(linkType))

        # Record the edge in both directions, creating neighbour maps on demand.
        adjacency.setdefault(source, {})[target] = weight
        adjacency.setdefault(target, {})[source] = weight

    logger.info("finished finding {} sublinks".format(len(allLinks)))

    return adjacency
コード例 #4
0
def distinctMoveStack(src, dst):
    """Copy collection `src` into `dst`, keeping one document per ``Id``.

    Documents are streamed from `src`; the first document seen for each
    ``Id`` is kept and later duplicates are dropped. Kept documents are
    written to `dst` in batches of 10000, with progress printed after each
    batch.

    Args:
        src: name of the source collection in the ``stackoverflow`` database.
        dst: name of the destination collection.
    """
    db = MongoStackExchange("10.1.1.9", "36666")
    db.useDB("stackoverflow")

    target = db.stackdb[dst]
    source = db.stackdb[src]

    def reportProgress():
        print("process {}/{}".format(target.count(),
                                     source.count()))

    seenIds = set()
    pending = []
    print("count before distinct", source.count())

    for doc in source.find().batch_size(10000):
        docId = doc["Id"]
        if docId not in seenIds:
            seenIds.add(docId)
            pending.append(doc)
            # The buffer is emptied every time it fills, so it never exceeds
            # one batch.
            if len(pending) == 10000:
                target.insert_many(pending)
                pending.clear()
                reportProgress()

    # Flush whatever is left over from the last partial batch.
    if pending:
        target.insert_many(pending)
        reportProgress()

    print("count after distinct", target.count())
コード例 #5
0
def buildGraph(dbName):
    """Build the post-link graph and split it into connected components.

    Every document of the ``postlinks`` collection becomes an undirected
    edge between ``PostId`` and ``RelatedPostId``. ``LinkTypeId`` 3 maps to
    edge weight 0 and ``LinkTypeId`` 1 maps to weight 1.

    Args:
        dbName: name of the database to select on the MongoDB server.

    Returns:
        ``(graphs, G)`` where ``G`` is the full ``networkx.Graph``. When ``G``
        has fewer than 1e4 nodes, ``graphs`` is ``[G]``; otherwise it is the
        list of connected-component subgraphs sorted by node count,
        largest first.

    Raises:
        ValueError: if a link has a ``LinkTypeId`` other than 1 or 3.
    """
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")
    allLinks = list(links.find().batch_size(args.batch_size))

    G = nx.Graph()
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        G.add_edge(id_a, id_b, weight=w)

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    # Ask the graph for its sizes directly rather than copying the edge and
    # node views into lists just to count them.
    logger.info("graph size of edges({}) and nodes({})".format(
        G.number_of_edges(), G.number_of_nodes()))

    if G.number_of_nodes() < 1e+4:
        return [G], G

    logger.info("cutting graph into small blocks")

    graphs = [G.subgraph(cc) for cc in nx.connected_components(G)]
    graphs.sort(key=lambda g: g.number_of_nodes(), reverse=True)

    logger.info("num of subGs:{}".format(len(graphs)))
    subnodes = [g.number_of_nodes() for g in graphs[:10]]
    logger.info("nodes of subG(top10):{}".format(subnodes))

    return graphs, G
コード例 #6
0
def genResults():
    """Pair each query's reference answer with its generated summary.

    Reads the query ids from ``query_list.txt`` and, for up to the first 100
    of them, the generated summary from ``Summary_list/<index>.txt``. For
    each query the reference answer is the accepted answer if it exists,
    otherwise the highest-scored answer. The plain text of that answer and
    the generated summary are stored under the query id, and the whole
    mapping is written to ``answers.json``.

    Queries with no question document, no answer, or no summary are reported
    on stdout and skipped.
    """
    import json

    Qids = readQueryId("../../dataCases/query_list.txt")
    Summy = {}
    # Summaries are read for at most the first 100 ids; bounding by len(Qids)
    # avoids an IndexError when the list is shorter than that.
    for i in range(min(100, len(Qids))):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)

    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")
    for qid in Qids:
        if qid not in Summy:
            # No generated summary was loaded for this id, so there is
            # nothing to pair the reference answer with.
            print("No summary", qid)
            continue

        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue

        ans = None
        acceptedId = question.get("AcceptedAnswerId")
        if acceptedId:
            accepted = docDB.answers.find_one({"Id": acceptedId})
            # The accepted answer may be missing from the collection; fall
            # through to the best-scored answer in that case.
            if accepted:
                ans = accepted["Body"]

        if ans is None:
            answers = list(docDB.answers.find({"ParentId": qid}))
            if not answers:
                print("Error!", qid)
                continue
            ans = max(answers, key=lambda x: x["Score"])["Body"]

        ans = " ".join(processor.getPlainTxt(ans))
        Answers[qid] = {"true": ans, "generated": Summy[qid]}
        print(len(Answers), Answers[qid])

    with open("../../dataCases/answers.json", "w") as f:
        json.dump(Answers, f)
コード例 #7
0
    labelDataNew = []
    for ld in labelData:
        id1, id2 = ld["pair"]
        if id1 not in q_ids_set or id2 not in q_ids_set:
            continue
        labelDataNew.append(ld)

    labels = map(lambda ll: ll["label"], labelData)

    import collections
    logger.info(collections.Counter(labels))

    generateQuestionCorpus(labelDataNew, postData_local)


# Script entry point: parse options, open the database and run main().
if __name__ == '__main__':

    # Command-line options: cursor batch size and the name of the task source.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--source', type=str, default="crossvalidated")

    # `args` becomes a module-level name.
    # NOTE(review): presumably read by main() and its helpers - not visible here.
    args = parser.parse_args()

    # Module-level database handle, bound to the "posts" database.
    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")

    logger.info("task source is {}".format(args.source))

    main()
コード例 #8
0
def initDB(dbName, host='10.1.1.9', port=50000):
    """Open a ``MongoStackExchange`` connection and select database `dbName`.

    Args:
        dbName: name of the database to select.
        host: MongoDB server host; defaults to the previously hard-coded
            address.
        port: MongoDB server port; defaults to the previously hard-coded
            port.

    Returns:
        The connected ``MongoStackExchange`` instance with `dbName` selected.
    """
    db = MongoStackExchange(host=host, port=port)
    db.useDB(dbName)

    return db
コード例 #9
0
        if len(cache)>0:
            f.writelines(cache)
            cache.clear()

def main():
    """Fetch question, answer and index data, then build the corpus.

    Answers and index entries are fetched for the keys of the question
    mapping, and all three data sets are handed to
    ``generateContextAnswerCorpusParallel``.
    """
    questions = fetchQuestionData()
    answers = fetchAnswerData(questions.keys())
    indexers = fetchIndexData(questions.keys())

    generateContextAnswerCorpusParallel(questions, answers, indexers)

# Script entry point: parse options, open the database and run main().
if __name__ == '__main__':
    # Command-line options: cursor batch size, database name and worker count.
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)

    parser.add_argument('--db', type=str, default="crossvalidated")

    parser.add_argument('--workers', type=int, default=10)

    # `args` becomes a module-level name.
    # NOTE(review): presumably read by the fetch/generate helpers - not visible here.
    args = parser.parse_args()

    # Module-level database handle, bound to the database chosen with --db.
    docDB=MongoStackExchange(host='10.1.1.9',port=50000)
    dbName=args.db
    docDB.useDB(dbName)

    logger.info("processing db data: {}".format(dbName))

    main()
コード例 #10
0
from post_rec.DataSet.DBLoader import MongoStackExchange
from tqdm import tqdm
import pickle
import numpy as np
from matplotlib import pyplot

# Module-level database handle, created at import time; functions below
# select the database they need with docDB.useDB(...).
docDB = MongoStackExchange(host='10.1.1.9', port=50000)


def countAnswerLen():
    """Print length statistics for the answers in the ``seq2seq`` collection.

    Selects the ``corpus`` database, measures every record's answer length
    in whitespace-separated tokens, and prints the mean, standard deviation,
    total record count, number of answers with at most 200 tokens, and a
    NumPy histogram of the lengths.
    """
    docDB.useDB('corpus')
    seq2seq = docDB.stackdb['seq2seq']
    answerLength = []  # token count of every answer
    retrictedLen = []  # token counts of answers with at most 200 tokens
    for record in tqdm(seq2seq.find().batch_size(10000),
                       desc="retrieving seq2seq record"):
        # record["answer"] is joined with spaces and re-split on whitespace,
        # so the length is a token count over the whole answer.
        ansL = len(" ".join(record["answer"]).split())
        answerLength.append(ansL)
        if ansL <= 200:
            retrictedLen.append(ansL)

    answerLength.sort()

    avg = np.mean(answerLength)
    std = np.std(answerLength)
    hist = np.histogram(answerLength)  # default binning

    print(avg, std, len(answerLength), len(retrictedLen))
    print(hist)
    # NOTE(review): `x` is not used in the visible part of this function; the
    # remainder (likely plotting) may lie outside this excerpt.
    x = np.arange(len(answerLength))
コード例 #11
0
    args = parser.parse_args()

    #load m_tags
    m_tag=post_rec.loadConfig(post_rec.ConfigPath + "TagSeeds.json")["Tags"]

    if not m_tag:
        m_tag=["<keras>","<tensorflow>","<caffe>","<pytorch>","<artificial-intelligence>","<nlp>","<computer-vision>",
               "<deep-learning>","<neural-network>","<machine-learning>","<reinforcement-learning>","<scikit-learn>"]
    print("search init with %d tag seeds"%len(m_tag))

    minSuppport=1000

    tuneMaxClipNum=None

    mongodb=MongoStackExchange(args.mongodb)
    dbname="stackoverflow"
    tagCounter=TagCounter(mongodb,dbname)
    tagCounter.ItemSeeds.update(m_tag)

    #mine fp tree
    frequentItems=getFrequentItems()

    print("get %d frequent patterns"%len(frequentItems),"below are the frequent patterns of seeds")

    '''
    seedCounter=tagCounter.getTagCounter(m_tag)
    for i in range(len(m_tag)):
        for j in range(1,len(m_tag)):
            tag1,tag2=m_tag[i],m_tag[j]
            tagP=frozenset({tag1,tag2})
コード例 #12
0
    with open(seq2seq_sample_file_dst, "w") as f:
        f.writelines(dataDst)


# Script entry point: parse options, open the database and run one generator.
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)

    args = parser.parse_args()

    # Module-level database handle, bound to the database chosen with --db.
    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB(args.db)

    # The tasks are mutually exclusive, so dispatch with a single chain and
    # report an unrecognised value instead of silently doing nothing.
    if args.task == "inference":
        logger.info("task is " + args.task)
        inferenceGen()
    elif args.task == "seq2seq":
        logger.info("task is " + args.task)
        seq2seqGen()
    elif args.task == "knowNet":
        logger.info("task is " + args.task)
        knowNetGen()
    else:
        logger.warning("unknown task {!r}; nothing was generated".format(args.task))
コード例 #13
0
def initPool():
    """Open a ``MongoStackExchange`` connection and store it in the global ``db``.

    The connection parameters are taken from the module-level ``MongodbAuth``
    mapping, expanded as keyword arguments.
    """
    global db
    connection = MongoStackExchange(**MongodbAuth)
    db = connection