import os
import re
import subprocess

# Document, TextRank, Ranker, validSentence, getContext, getDepParse,
# parseTrees, sent2Section, processTree, deleteFiles, writeToFile,
# getConnection, DIR and the module-level test_data dict are project-local
# helpers assumed to be defined elsewhere in this package.

def generateTestFeatures(infile):
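    """Pick the top-ranked TextRank sentences from a ParsCit section XML
    file and ask the user to label each one, storing the sentence, its
    TextRank score and its surrounding context in the module-level
    test_data dict for later analysis."""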
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    _, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    num = 7  # number of valid sentences to collect
    x = 0    # cursor into the ranked score list
    test_sents = []
    sent_indices = []
    while num > 0 and x < len(ranker.scores):
        idx = ranker.scores[x][0] + all_offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        sent_indices.append(idx)
        test_sents.append(doc[idx].sentence.encode('utf-8'))
        num -= 1
        #------------------------------------------------
        # For display and analysis
        key = fcode + '-' + str(idx)
        test_data[key] = {
            'sentence': doc[idx].sentence.encode('utf-8'),
            'textrank': ranker.scores[x - 1][1],
            'contextpre': getContext(doc, idx, -2),
            'contextpos': getContext(doc, idx, 2)
        }
    #-----------------------------------------
    for sentence, sent_idx in zip(test_sents, sent_indices):
        key = fcode + '-' + str(sent_idx)
        print key
        print test_data[key]['contextpre']
        print "----Main sentence Start----"
        print test_data[key]['sentence']
        print "----Main sentence End----"
        print test_data[key]['contextpos']
        # The annotator types the true label string for this sentence;
        # a trailing '1' is appended before it is stored.
        feature_string = raw_input()
        feature_string += '1'
        test_data[key]['reallbl'] = feature_string
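
# A small helper sketch (not part of the original module): once the
# interactive loop above has filled test_data, the labelled sentences can
# be dumped for later analysis. The output path is a hypothetical default.
def dumpTestData(outpath='test-data.json'):
    import json
    with open(outpath, 'w') as out:
        json.dump(test_data, out, indent=2)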
def classifyDoc(document):
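    """Build an extractive summary of a document: rank its sentences with
    TextRank, score each candidate with a pre-trained SVM-light model on
    dependency-parse features, and keep positively classified sentences
    until 10 sentences or roughly 130 words have been collected."""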
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Group each section's sentences into one string and hand the list of
    # sections to the section-level ranker.
    sections = []
    for _, block in doc.document.items():
        section_text = ''.join(str(block[key]) for key in sorted(block))
        sections.append(section_text)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20     # hard cap on candidates examined, to guarantee termination
    num = 10        # maximum number of sentences in the summary
    x = 0           # cursor into the ranked score list
    summary = []
    sent_idx = [0]  # one-element list because sent2Section expects a sequence
    sum_len = 0     # running summary length, in words
    while num > 0 and x < len(ranker.scores):
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket,
                                      doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # The sent_idx needs to be converted to reflect the corresponding
        # section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        deleteFiles([featurefile])
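        # Each line written below follows the SVM-light input format:
        #   <label> <index>:<value> <index>:<value> ...
        # "+1" is a dummy label here; svm_classify only uses it to report
        # accuracy, while the decision value written to outfile is what
        # drives the selection.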
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
        if sum_len > 130:  # stop once the summary exceeds ~130 words
            break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
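
# A minimal command-line driver, a sketch only: this excerpt does not show
# the module's real entry point, and "summarizer.py" is a hypothetical name.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 2:
        print "Usage: python summarizer.py <paper>-parscit-section.xml"
        sys.exit(1)
    classifyDoc(sys.argv[1])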