import subprocess
from operator import itemgetter

# Project helpers (DIR, Document, Ranker, TextRank, getConnection,
# getSecRankedSent, sent2Section, parseTrees, getDepParse, processTree,
# deleteFiles, writeToFile) are imported from the surrounding project.
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Concatenate each section's sentences and pass the sections to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # Convert the sentence indices to the corresponding section indices
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"  # placeholder label; only used by svm_classify for accuracy reporting
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
            classified.append((sent, sent_val))
    # Take the highest-scoring sentences first, up to a ~130-word budget
    for sent, val in sorted(classified, key=itemgetter(1), reverse=True):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
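
For context, svm_classify expects one example per line in SVM-light's sparse format: a target value followed by ascending index:value pairs. processTree is project code not shown here; assuming it emits features in that format, here is a minimal sketch of building such a line (the helper name to_svmlight_line is hypothetical):

# Hypothetical helper illustrating the SVM-light feature-line format
# that feature_string above is assumed to follow.
def to_svmlight_line(label, features):
    # features: {int feature index: float value}; label: +1 or -1
    pairs = ' '.join('%d:%g' % (i, features[i]) for i in sorted(features))
    return '%+d %s' % (label, pairs)

# e.g. to_svmlight_line(1, {1: 0.43, 7: 0.12}) -> '+1 1:0.43 7:0.12'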
Example #2
from glob import glob
import logging

def mainline(train=False):
    datadir = DIR['BASE'] + "data/"
    if train is True:
        featurefile = datadir + 'train-features.txt'
        xmldir = DIR['BASE'] + "demo/train/"
    else:
        featurefile = datadir + 'test-features.txt'
        xmldir = DIR['BASE'] + "demo/test/"
    deleteFiles([featurefile])
    #infile = xmldir + 'C08-1122-parscit-section.xml'
    client_socket = getConnection()
    for infile in glob(xmldir + "*.xml"):
        try:
            print infile + " is being processed."
            if train is True:
                generateTrainFeatures(client_socket, infile, featurefile)
            else:
                generateTestFeatures(client_socket, infile, featurefile)
        except Exception as e:
            print "Exception in the main pipeline while processing " + infile
            print str(type(e))
            print str(e)
            logging.exception("Something awful happened!")
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    if train is False:
        # Testing
        outfile = DIR['DATA'] + "sec-tfidf-test-out.txt"
        for gamma in [1.0]:  # single value; extend the list to sweep gamma
            # The training branch below saves to `model` without a gamma
            # suffix, so load the same path here
            predictSvm(featurefile, model, outfile)
            outstring = "Testing. Weight : " + str(gamma)
            analyze(featurefile, outfile, outstring)
        #pickleIt()
    else:
        # Training
        outfile = DIR['DATA'] + "sec-tfidf-train-out.txt"
        deleteFiles([outfile])
        for gamma in [1.0]:
            #trainSvm(featurefile, model + str(gamma), gamma)
            trainSvm(featurefile, model, gamma)
            predictSvm(featurefile, model, outfile)
            outstring = "Training. gamma : " + str(gamma)
            analyze(featurefile, outfile, outstring=outstring)
        pickleIt()
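
trainSvm and predictSvm are not shown in these examples. A minimal sketch, assuming they shell out to the bundled svm-light binaries the same way classifyDoc invokes svm_classify (svm_learn's -t 2 selects the RBF kernel and -g sets its gamma):

import subprocess

def trainSvm(featurefile, model, gamma=1.0):
    # Sketch only: fit with svm-light's svm_learn (RBF kernel, given gamma)
    learn = DIR['BASE'] + "lib/svm-light/svm_learn"
    subprocess.call([learn, '-t', '2', '-g', str(gamma), featurefile, model])

def predictSvm(featurefile, model, outfile):
    # Sketch only: score featurefile against model, one margin per line
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    subprocess.call([classify, featurefile, model, outfile])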
Example #3
from glob import glob
import logging

def mainline(train=False):
    datadir = DIR["BASE"] + "data/"
    if train is True:
        featurefile = datadir + "train-features.txt"
        xmldir = DIR["BASE"] + "demo/train/"
    else:
        featurefile = datadir + "test-features.txt"
        xmldir = DIR["BASE"] + "demo/test/"
    deleteFiles([featurefile])
    # infile = xmldir + 'C08-1122-parscit-section.xml'
    client_socket = getConnection()
    for infile in glob(xmldir + "*.xml"):
        try:
            print infile + " is being processed."
            if train is True:
                generateTrainFeatures(client_socket, infile, featurefile)
            else:
                generateTestFeatures(client_socket, infile, featurefile)
        except Exception as e:
            print "Exception in the main pipeline while processing " + infile
            print str(type(e))
            print str(e)
            logging.exception("Something awful happened!")
    model = DIR["DATA"] + "sec-tfidf-model.txt"
    if train is False:
        # TESTING
        outfile = DIR["DATA"] + "sec-tfidf-test-out.txt"
        predictSvm(featurefile, model, outfile)
        extractValues(outfile)
        outstring = "Default values Test results"
        analyze(featurefile, outfile, outstring=outstring)
        pickleIt()
    else:
        # TRAINING
        trainSvm(featurefile, model)
        outfile = DIR["DATA"] + "sec-tfidf-train-out.txt"
        predictSvm(featurefile, model, outfile)
        outstring = "Default values"
        analyze(featurefile, outfile, outstring=outstring)
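
A minimal driver for the two phases might look like this (a sketch only; the log file name is an assumption):

if __name__ == '__main__':
    logging.basicConfig(filename='pipeline.log', level=logging.DEBUG)
    mainline(train=True)   # featurize demo/train/ and fit the model
    mainline(train=False)  # featurize demo/test/ and evaluate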
Example #4
import subprocess

# Project helpers (DIR, Document, Ranker, TextRank, etc.) are imported
# from the surrounding project.
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Concatenate each section's sentences and pass the sections to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20    # hard cap on the number of candidate sentences examined
    num = 10       # stop after this many sentences are accepted
    x = 0
    summary = []
    sent_idx = [0]
    sum_len = 0
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket,
                                      doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # Convert the sentence index to the corresponding section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"  # placeholder label; only used by svm_classify for accuracy reporting
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
        if sum_len > 130:
            break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
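
svm_classify writes one real-valued margin per input line, and the sign of each value is the predicted class, which is why the loop above treats sent_val > 0 as "include in the summary". A minimal sketch of reading a multi-line output file (the helper name readMargins is hypothetical):

def readMargins(outfile):
    # One float per line; a positive margin means the positive class
    with open(outfile, 'r') as ofile:
        return [float(line) for line in ofile if line.strip()]

# e.g. labels = [+1 if v > 0 else -1 for v in readMargins(outfile)]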