def test3():
    # Estimate how many questions and answers are dominated by code: a post
    # counts as "code-heavy" when its code snippets make up more than
    # `threshold` of the raw text length.
    from programmingalpha.DataSet.DBLoader import MongoStackExchange
    from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent
    processor=PreprocessPostContent()
    db=MongoStackExchange(host='10.1.1.9',port=50000)
    dbName='stackoverflow'
    db.useDB(dbName)
    count=0
    threshold=0.2
    verbose=0
    for q in db.questions.find().batch_size(10000):
        txt=q['Title']+q['Body']
        codes=' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose<10:
            print(len(codes),len(txt))
            verbose+=1

        if len(txt) and len(codes)/len(txt)>threshold:
            count+=1
    print("code question is {}/{}".format(count,db.questions.count()))

    count=0
    for ans in db.answers.find().batch_size(10000):
        txt=ans['Body']
        codes=' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose<10:
            print(len(codes),len(txt))
            verbose+=1

        if len(txt) and len(codes)/len(txt)>threshold:
            count+=1
    print('code-heavy answers: {}/{}'.format(count,db.answers.count()))
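# A hedged sketch of the same code-ratio heuristic applied to a single post
# (the 0.2 threshold and connection details mirror test3 above;
# `some_question_id` is a placeholder):
#
#   db = MongoStackExchange(host='10.1.1.9', port=50000)
#   db.useDB('stackoverflow')
#   processor = PreprocessPostContent()
#   q = db.questions.find_one({"Id": some_question_id})
#   txt = q['Title'] + q['Body']
#   codes = ' '.join(processor.getCodeSnippets(txt))
#   is_code_heavy = len(txt) > 0 and len(codes) / len(txt) > 0.2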
# Example 2
def buildGraph(dbName):
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    links = db.stackdb.get_collection("postlinks")

    allLinks = list(links.find().batch_size(args.batch_size))

    myG = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        # record the edge in both directions (undirected adjacency dict)
        if id_a in myG:
            myG[id_a][id_b] = w
        else:
            myG[id_a] = {id_b: w}

        if id_b in myG:
            myG[id_b][id_a] = w
        else:
            myG[id_b] = {id_a: w}

    logger.info("finished finding {} sublinks".format(len(allLinks)))

    return myG
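# Usage sketch for the adjacency-dict version above (assumes the module-level
# `args.batch_size`, `logger`, and tqdm import that the snippet relies on;
# `some_post_id` is a placeholder):
#
#   graph = buildGraph("stackoverflow")
#   neighbors = graph.get(some_post_id, {})   # {related_post_id: weight, ...}
#   duplicates = [pid for pid, w in neighbors.items() if w == 0]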
# Example 3
def init(tokenizer_class):
    # Per-worker initializer for a multiprocessing pool: create one tokenizer
    # and one MongoDB connection per process, and register cleanup handlers.
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = MongoStackExchange(host='10.1.1.9', port=36666)
    PROCESS_DB.useDB(dbName)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
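# How this initializer is typically wired into a worker pool (a sketch;
# `SimpleTokenizer`, the pool size, and `process_one` are placeholders, and
# `Finalize` is assumed to come from multiprocessing.util):
#
#   from multiprocessing import Pool
#   workers = Pool(processes=16, initializer=init, initargs=(SimpleTokenizer,))
#   results = workers.map(process_one, question_ids)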
def buildGraph(dbName):
    db=MongoStackExchange(host='10.1.1.9',port=50000)
    db.useDB(dbName)

    links=db.stackdb.get_collection("postlinks")

    allLinks=list(links.find().batch_size(args.batch_size))

    G=nx.Graph()

    myG={}
    for link in tqdm.tqdm(allLinks,desc="building graph from links"):
        id_a,id_b=link["PostId"],link["RelatedPostId"]
        r=link["LinkTypeId"]
        if r==3:
            w=0
        elif r==1:
            w=1
        else:
            raise ValueError("unexpected value {} for link type".format(r))

        G.add_edge(id_a,id_b,weight=w)

        if id_a in myG:
            myG[id_a][id_b]=w
        else:
            myG[id_a]={id_b:w}

        if id_b in myG:
            myG[id_b][id_a]=w
        else:
            myG[id_b]={id_a:w}

    logger.info("finished finding {} sublinks".format(len(allLinks)))
    logger.info("graph size of edges({}) and nodes({})".format(len(list(G.edges)),len(list(G.nodes))))



    # small graphs are returned whole; larger ones are split into connected components
    if len(G.nodes)<1e+4:
        return [G],G
    else:
        logger.info("cutting graph into small blocks")

    graphs=[]

    for cc in nx.connected_components(G):
        g=G.subgraph(cc)
        graphs.append(g)

    graphs.sort(key=lambda g:len(g.nodes),reverse=True)

    logger.info("num of subGs:{}".format(len(graphs)))
    subnodes=list(map(lambda g:len(g.nodes),graphs))[:10]
    logger.info("nodes of subG(top10):{}".format(subnodes))


    return graphs,G
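# Usage sketch (assumes the module-level `args`, `logger`, and
# `import networkx as nx` that the snippet relies on):
#
#   graphs, G = buildGraph("stackoverflow")
#   largest = graphs[0]   # connected components sorted by node count, descending
#   print(len(largest.nodes), len(largest.edges))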
def genResults():
    # Pair each query's generated summary with a reference answer (the accepted
    # answer if available, otherwise the highest-scored answer) and dump the
    # pairs to answers.json.
    Qids = readQueryId("../../dataCases/query_list.txt")
    Summy = {}
    for i in range(100):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)

    print(len(Qids), len(Summy))
    print(Summy)

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")
    for qid in Qids:
        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue
        #print(question)
        if "AcceptedAnswerId" in question and question["AcceptedAnswerId"]:
            ans = docDB.answers.find_one({"Id": question["AcceptedAnswerId"]
                                          })["Body"]
        else:
            answers = docDB.answers.find({"ParentId": qid})
            answers = list(answers)
            if len(answers) < 1:
                print("Error!", qid)
                continue
            answers.sort(key=lambda x: x["Score"], reverse=True)
            ans = answers[0]["Body"]

        ans = processor.getPlainTxt(ans)
        ans = " ".join(ans)
        Answers[qid] = {"true": ans, "generated": Summy[qid]}
        print(len(Answers), Answers[qid])
        #break

    with open("../../dataCases/answers.json", "w") as f:
        import json
        json.dump(Answers, f)
    questionsDataGlobal, ansIdxGlobal = fetchQuestionData(needed_qids)
    answersDataGlobal = fetchAnswerData(ansIdxGlobal,
                                        questionsDataGlobal.keys())

    questionsDataGlobal.update(unsolvedQuestionGlobal)
    generateContextAnswerCorpusParallel(distance_dataNew, questionsDataGlobal,
                                        answersDataGlobal)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--samples', type=int, default=2000)

    parser.add_argument('--db', type=str, default="stackoverflow")
    parser.add_argument('--contextLen', type=int, default=1200)
    parser.add_argument('--questionLen', type=int, default=150)
    parser.add_argument('--lose_rate', type=float, default=0.5)

    parser.add_argument('--extractor', type=str, default="lexrankS")

    parser.add_argument('--workers', type=int, default=32)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = args.db
    docDB.useDB(dbName)

    main()
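# Example invocation (the script name is hypothetical; the flags match the
# argparse definition above):
#
#   python generate_corpus.py --db stackoverflow --extractor lexrankS --workers 32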
                                        "TagSeeds.json")["Tags"]

    if not m_tag:
        m_tag = [
            "<keras>", "<tensorflow>", "<caffe>", "<pytorch>",
            "<artificial-intelligence>", "<nlp>", "<computer-vision>",
            "<deep-learning>", "<neural-network>", "<machine-learning>",
            "<reinforcement-learning>", "<scikit-learn>"
        ]
    print("search init with %d tag seeds" % len(m_tag))

    minSupport = 1000

    tuneMaxClipNum = None

    mongodb = MongoStackExchange(args.mongodb)
    dbname = "stackoverflow"
    tagCounter = TagCounter(mongodb, dbname)
    tagCounter.ItemSeeds.update(m_tag)

    #mine fp tree
    frequentItems = getFrequentItems()

    print("get %d frequent patterns" % len(frequentItems),
          "below are the frequent patterns of seeds")
    # seedCounter=tagCounter.getTagCounter(m_tag)
    # for i in range(len(m_tag)):
    #     for j in range(1,len(m_tag)):
    #         tag1,tag2=m_tag[i],m_tag[j]
    #         tagP=frozenset({tag1,tag2})
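# A hedged sketch of how that commented-out check might be completed
# (`frequentItems` is assumed to map frozensets of tags to support counts;
# this is an illustration, not the original code):
#
#   for i in range(len(m_tag)):
#       for j in range(i + 1, len(m_tag)):
#           tagP = frozenset({m_tag[i], m_tag[j]})
#           if tagP in frequentItems:
#               print(tagP, frequentItems[tagP])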
# Example 8
from programmingalpha.Utility.TextPreprocessing import PreprocessPostContent
import json

if __name__ == '__main__':
    from programmingalpha.DataSet.DBLoader import MongoStackExchange

    db = MongoStackExchange("mongodb://10.1.1.9")
    AIQA = db.stackdb["QAPForAI"]

    data = []
    with open("testdata/quetions.txt", "w") as f:
        for x in AIQA.find().batch_size(100):
            txt = x["question_title"] + " " + x["question_body"]

            processTxt = PreprocessPostContent()
            processTxt.raw_txt = txt
            emcodes = processTxt.getEmCodes()
            snippets = processTxt.getCodeSnippets()
            plaintxt = processTxt.getPlainTxt()

            data.append(
                json.dumps({
                    "emcodes": emcodes,
                    "snippets": snippets,
                    "plaintxt": plaintxt
                }) + "\n")

            if len(data) % 1000 == 0:
                f.writelines(data)
                data.clear()

        if len(data) > 0:
            f.writelines(data)
            data.clear()
# Example 9
def initDB(dbName):

    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)

    return db
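# Usage sketch (connection details as in the snippet; `some_question_id` is a
# placeholder):
#
#   db = initDB("stackoverflow")
#   question = db.questions.find_one({"Id": some_question_id})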
    with open(seq2seq_sample_file_dst,"w") as f:
        f.writelines(dataDst)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)

    args = parser.parse_args()

    docDB=MongoStackExchange(host='10.1.1.9',port=50000)
    docDB.useDB(args.db)

    if args.task=="inference":
        logger.info("task is "+args.task)
        inferenceGen()
    if args.task=="seq2seq":
        logger.info("task is "+args.task)
        seq2seqGen()

    if args.task=="knowNet":
        logger.info("task is "+args.task)
        knowNetGen()
# Example 11
            if record is not None:
                cache.append(json.dumps(record) + "\n")

            if len(cache) > args.batch_size:
                f.writelines(cache)
                cache.clear()
                f.flush()

        if len(cache) > 0:
            f.writelines(cache)
            cache.clear()


if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--batch_size', type=int, default=10)

    parser.add_argument('--relative_num', type=int, default=5)

    parser.add_argument('--target_answer_len', type=int, default=100)
    parser.add_argument('--answer_len', type=int, default=80)
    parser.add_argument('--question_len', type=int, default=80)

    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")

    processData()
from programmingalpha.DataSet.DBLoader import MongoStackExchange
from tqdm import tqdm
import pickle
import numpy as np
from matplotlib import pyplot

docDB=MongoStackExchange(host='10.1.1.9',port=50000)

def countAnswerLen():
    # Distribution of answer lengths (in whitespace-delimited tokens) over the
    # seq2seq corpus; restrictedLen keeps only answers of at most 200 tokens.
    docDB.useDB('corpus')
    seq2seq=docDB.stackdb['seq2seq']
    answerLength=[]
    restrictedLen=[]
    for record in tqdm(seq2seq.find().batch_size(10000),desc="retrieving seq2seq record"):
        ansL=len(" ".join(record["answer"]).split())
        answerLength.append(ansL)
        if ansL<=200:
            restrictedLen.append(ansL)

    answerLength.sort()

    avg=np.mean(answerLength)
    std=np.std(answerLength)
    hist=np.histogram(answerLength)

    print(avg,std,len(answerLength),len(restrictedLen))
    print(hist)
    x=np.arange(len(answerLength))
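# The snippet is truncated here; a plausible continuation (an assumption, not
# the original code) would plot the sorted length distribution using the
# pyplot import above:
#
#   pyplot.plot(x, answerLength)
#   pyplot.xlabel("answer rank")
#   pyplot.ylabel("answer length (tokens)")
#   pyplot.show()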