Example #1
0
def addToVocab(word, answer_id, priority_type, priority_val):
    """Accumulate a priority value for ``word`` in the in-memory index.

    ``VOCAB_DICT`` is nested as ``word -> answer_id -> priority_type ->
    value``. Calling this again with the same three keys adds
    ``priority_val`` to the value already stored.

    Before inserting, if ``VOCAB_DICT`` already holds ``VOCAB_DICT_LIMIT``
    or more words, it is written through ``FileUtil.writeToFile`` to
    ``OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)``, the counter
    is incremented and the dictionary is emptied.

    Args:
        word: Top-level key of the index.
        answer_id: Second-level key.
        priority_type: Third-level key.
        priority_val: Amount to store, or to add to the existing value.
    """
    # The counter is rebound below, so it must be declared global. Without
    # this declaration Python treats it as a local and the read in the
    # flush branch raises UnboundLocalError.
    global TEMP_FILE_COUNTER

    if len(VOCAB_DICT) >= VOCAB_DICT_LIMIT:
        # create a temporary index file
        outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)
        FileUtil.writeToFile(VOCAB_DICT, outfilePath)
        TEMP_FILE_COUNTER += 1
        VOCAB_DICT.clear()

    # Create the two outer levels on demand, then add at the innermost one.
    priorities = VOCAB_DICT.setdefault(word, {}).setdefault(answer_id, {})
    if priority_type in priorities:
        priorities[priority_type] = priorities[priority_type] + priority_val
    else:
        priorities[priority_type] = priority_val
Example #2
0
def addToVocab(word, answer_id, priority_type, priority_val):
    """Accumulate a priority value for ``word`` in the in-memory index.

    ``VOCAB_DICT`` is nested as ``word -> answer_id -> priority_type ->
    value``. Calling this again with the same three keys adds
    ``priority_val`` to the value already stored.

    Before inserting, if ``VOCAB_DICT`` already holds ``VOCAB_DICT_LIMIT``
    or more words, it is written through ``FileUtil.writeToFile`` to
    ``OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)``, the counter
    is incremented and the dictionary is emptied.

    Args:
        word: Top-level key of the index.
        answer_id: Second-level key.
        priority_type: Third-level key.
        priority_val: Amount to store, or to add to the existing value.
    """
    # The counter is rebound below, so it must be declared global. Without
    # this declaration Python treats it as a local and the read in the
    # flush branch raises UnboundLocalError.
    global TEMP_FILE_COUNTER

    if len(VOCAB_DICT) >= VOCAB_DICT_LIMIT:
        # create a temporary index file
        outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)
        FileUtil.writeToFile(VOCAB_DICT, outfilePath)
        TEMP_FILE_COUNTER += 1
        VOCAB_DICT.clear()

    # Create the two outer levels on demand, then add at the innermost one.
    priorities = VOCAB_DICT.setdefault(word, {}).setdefault(answer_id, {})
    if priority_type in priorities:
        priorities[priority_type] = priorities[priority_type] + priority_val
    else:
        priorities[priority_type] = priority_val
Example #3
0
            else:
                prts[priority_type] = priority_val
        else:
            answers[answer_id] = {priority_type: priority_val}
    else:
        VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}}
        
def processText(text, priority):
    """Count the stems of ``text`` and record them with ``addToVocab``.

    The text is passed through ``TextUtil.cleanUpText``, split on
    whitespace, and each token is stemmed with ``stemmer.stem``. Every
    distinct stem that is not in ``STOP_WORDS`` and is longer than two
    characters is recorded with its occurrence count.

    The answer id is not a parameter: it is read from the module-level
    ``ans_id``, which the caller must set before calling.

    Args:
        text: Raw text to index.
        priority: Priority type forwarded to ``addToVocab``.
    """
    stems = [stemmer.stem(token)
             for token in TextUtil.cleanUpText(text).split()]
    for stem, occurrences in Counter(stems).items():
        if stem in STOP_WORDS or len(stem) <= 2:
            continue
        addToVocab(stem, ans_id, priority, occurrences)
        

linecount = 0
for (title, ques, ans) in FileUtil.readCMUQAData(qaFile1):
    ans_id = MongoUtil.getAnswerID(ans)
    MongoUtil.saveQARelation(ques, ans_id)
    
    # Work on title
    processText(title, 'p')
            
    # Work on ques
    processText(ques, 'q')
    
    # Work on ans
    processText(ans, 'r')
        
    linecount = linecount + 1
    if linecount % 100 == 0:
        print "Read %d QA data..." % linecount
Example #4
0
            else:
                prts[priority_type] = priority_val
        else:
            answers[answer_id] = {priority_type: priority_val}
    else:
        VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}}
        
def processText(text, priority):
    """Count the stems of ``text`` and record them with ``addToVocab``.

    The text is passed through ``TextUtil.cleanUpText``, split on
    whitespace, and each token is stemmed with ``stemmer.stem``. Every
    distinct stem that is not in ``STOP_WORDS`` and is longer than two
    characters is recorded with its occurrence count.

    The answer id is not a parameter: it is read from the module-level
    ``ans_id``, which the caller must set before calling.

    Args:
        text: Raw text to index.
        priority: Priority type forwarded to ``addToVocab``.
    """
    stems = [stemmer.stem(token)
             for token in TextUtil.cleanUpText(text).split()]
    for stem, occurrences in Counter(stems).items():
        if stem in STOP_WORDS or len(stem) <= 2:
            continue
        addToVocab(stem, ans_id, priority, occurrences)
        

linecount = 0
for (title, ques, ans) in FileUtil.readNistQAData(qaFilesDir):
    ans_id = MongoUtil.getAnswerID(ans)
    MongoUtil.saveQARelation(ques, ans_id)
    
    # Work on title
    processText(title, 'p')
            
    # Work on ques
    processText(ques, 'q')
    
    # Work on ans
    processText(ans, 'r')
        
    linecount = linecount + 1
    if linecount % 100 == 0:
        print "Read %d QA files..." % linecount
Example #5
0
        else:
            answers[answer_id] = {priority_type: priority_val}
    else:
        VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}}


def processText(text, priority):
    """Count the stems of ``text`` and record them with ``addToVocab``.

    The text is passed through ``TextUtil.cleanUpText``, split on
    whitespace, and each token is stemmed with ``stemmer.stem``. Every
    distinct stem that is not in ``STOP_WORDS`` and is longer than two
    characters is recorded with its occurrence count.

    The answer id is not a parameter: it is read from the module-level
    ``ans_id``, which the caller must set before calling.

    Args:
        text: Raw text to index.
        priority: Priority type forwarded to ``addToVocab``.
    """
    stems = [stemmer.stem(token)
             for token in TextUtil.cleanUpText(text).split()]
    for stem, occurrences in Counter(stems).items():
        if stem in STOP_WORDS or len(stem) <= 2:
            continue
        addToVocab(stem, ans_id, priority, occurrences)


linecount = 0
for (title, ques, ans) in FileUtil.readJeopardyQAData(qaFilePath):
    ans_id = MongoUtil.getAnswerID(ans)
    MongoUtil.saveQARelation(ques, ans_id)

    # Work on title
    processText(title, 'p')

    # Work on ques
    processText(ques, 'q')

    # Work on ans
    processText(ans, 'r')

    linecount = linecount + 1
    if linecount % 100 == 0:
        print "Read %d QA files..." % linecount
Example #6
0
        SE_ANSID_COUNTER)  #MongoUtil.getAnswerID(ans)
    SE_ANSID_COUNTER += 1
    #MongoUtil.saveQARelation(ques, ans_id)
    ansid_file.write(
        (ans_id + '\t' + ques + '\t' + ans + '\n').encode('utf-8'))

    # Work on title
    processText(title, 'p')

    # Work on ques
    processText(ques, 'q')

    # Work on ans
    processText(ans, 'r')

    linecount = linecount + 1
    if linecount % 1000 == 0:
        print "time taken to read %d : " % linecount, time.time() - start_t
        start_t = time.time()

# Write to file if its still left
if len(VOCAB_DICT) > 0:
    # create a temporary index file
    outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)
    FileUtil.writeToFile(VOCAB_DICT, outfilePath)
    VOCAB_DICT.clear()

ansid_file.close()
print "Total time : ", time.time() - tot_t
print 'Done!'
Example #7
0
        else:
            answers[answer_id] = {priority_type: priority_val}
    else:
        VOCAB_DICT[word] = {answer_id: {priority_type: priority_val}}


def processText(text, priority):
    """Count the stems of ``text`` and record them with ``addToVocab``.

    The text is passed through ``TextUtil.cleanUpText``, split on
    whitespace, and each token is stemmed with ``stemmer.stem``. Every
    distinct stem that is not in ``STOP_WORDS`` and is longer than two
    characters is recorded with its occurrence count.

    The answer id is not a parameter: it is read from the module-level
    ``ans_id``, which the caller must set before calling.

    Args:
        text: Raw text to index.
        priority: Priority type forwarded to ``addToVocab``.
    """
    stems = [stemmer.stem(token)
             for token in TextUtil.cleanUpText(text).split()]
    for stem, occurrences in Counter(stems).items():
        if stem in STOP_WORDS or len(stem) <= 2:
            continue
        addToVocab(stem, ans_id, priority, occurrences)


linecount = 0
for (title, ques, ans) in FileUtil.readNistQAData(qaFilesDir):
    ans_id = MongoUtil.getAnswerID(ans)
    MongoUtil.saveQARelation(ques, ans_id)

    # Work on title
    processText(title, 'p')

    # Work on ques
    processText(ques, 'q')

    # Work on ans
    processText(ans, 'r')

    linecount = linecount + 1
    if linecount % 100 == 0:
        print "Read %d QA files..." % linecount
Example #8
0
    ans_id = SE_ANSID_PREFIX + str(SE_ANSID_COUNTER)    #MongoUtil.getAnswerID(ans)
    SE_ANSID_COUNTER += 1
    #MongoUtil.saveQARelation(ques, ans_id)
    ansid_file.write((ans_id + '\t' + ques + '\t' + ans + '\n').encode('utf-8'))
    
    # Work on title
    processText(title, 'p')
    
    # Work on ques
    processText(ques, 'q')

    # Work on ans
    processText(ans, 'r')
        
    linecount = linecount + 1
    if linecount % 1000 == 0:
        print "time taken to read %d : " % linecount, time.time() - start_t
        start_t = time.time()
            
# Write to file if its still left
if len(VOCAB_DICT) > 0:
    # create a temporary index file
    outfilePath = OUTPUT_DIR + TEMP_FILE_PREFIX + str(TEMP_FILE_COUNTER)
    FileUtil.writeToFile(VOCAB_DICT, outfilePath)
    VOCAB_DICT.clear()
    
ansid_file.close()
print "Total time : ", time.time() - tot_t
print 'Done!'
            
    
Example #9
0
    def startElement(self, name, attrs):
        """Store one ``row`` element of a Posts XML file via ``MongoUtil``.

        Elements other than ``row`` are ignored. For a ``row``:

        * ``PostTypeId == '1'`` with an ``AcceptedAnswerId`` attribute is
          saved as a question (id, tag-stripped body, title, accepted
          answer id).
        * ``PostTypeId == '2'`` is saved as an answer (id, tag-stripped
          body).
        * Anything else (questions without an accepted answer, other post
          types) is skipped, so that it is not stored as an answer.

        All ids are prefixed with ``self.id_prefix`` and a dot.

        Args:
            name: Element name reported by the SAX parser.
            attrs: Attribute mapping of the element.
        """
        if name != 'row':
            return

        try:
            post_type = attrs['PostTypeId']
            if post_type == '1' and 'AcceptedAnswerId' in attrs:
                # Handle questions
                qid = self.id_prefix + '.' + attrs['Id']
                body = TextUtil.strip_tags(attrs['Body'])
                title = attrs['Title']
                aid = self.id_prefix + '.' + attrs['AcceptedAnswerId']
                MongoUtil.saveSEQuestion(qid, body, title, aid)
            elif post_type == '2':
                # Handle answers. In the Stack Exchange dump schema,
                # PostTypeId '2' marks an answer; a plain ``else`` would
                # also catch unanswered questions and wiki posts.
                aid = self.id_prefix + '.' + attrs['Id']
                body = TextUtil.strip_tags(attrs['Body'])
                MongoUtil.saveSEAnswer(aid, body)
        except Exception:
            # Best effort: a row that is malformed or cannot be stored is
            # skipped. Unlike a bare ``except:``, KeyboardInterrupt and
            # SystemExit still propagate, so a long parse can be aborted.
            pass
                
parser = xml.sax.make_parser()
for cfile in ['/home/brij/Documents/moody/datasets/stackexchange_data/stackoverflow/stackoverflow.com-Posts.7z']: #FileUtil.getSO7zFiles("/home/brij/Documents/moody/datasets/stackexchange_data/"):
    fname = FileUtil.getFilenameWithoutExt(cfile)
    print "Extracting %s ..." % fname
    FileUtil.extractPostsXml(cfile, OUTPUT_FOLDER)
    print "Parsing %s ..." % fname
    parser.setContentHandler(SEPostXmlHandler(fname))
    parser.parse(open(OUTPUT_XML, 'r'))
    print "Done parsing %s ..." % fname