Example #1
import os
import pickle
import re
# Document, DIR, and the helper functions used below are project-local.

def generateTestFeatures(client_socket, infile, featurefile):
    # ------------------------------------------------
    doc = Document(infile)
    # ------------------------------------------------
    # Load pickle for label
    picklefile = DIR["DATA"] + "test-labels-pickle"
    global test_labels
    with open(picklefile, "rb") as pfile:
        test_labels = pickle.load(pfile)
    # ------------------------------------------------
    # For display and analysis
    _, filename = os.path.split(infile)
    fcode = re.match(r"(.+)-parscit-section\.xml", filename).group(1)
    # ------------------------------------------------
    test_sents, sent_indices = getRankedSent(doc, fcode)
    # -----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ""
        for key in sorted(block.keys()):
            sentences += str(block[key])
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    # -----------------------------------------
    for sentence, sent_idx, sec_idx in zip(test_sents, sent_indices, sec_indices):
        key = fcode + "-" + str(sent_idx)
        # Note: the loop reads the module-level test_data dict,
        # not the test_labels loaded above.
        feature_string = test_data[key]["reallbl"]
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        test_data[key]["depparse"] = getTree(tree)
        test_data[key]["features"] = feature_string
        writeToFile(featurefile, feature_string + "\n", "a")
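
All of these examples append feature lines through a project-local writeToFile helper whose implementation is not shown here. A minimal sketch of what it presumably does, given the writeToFile(featurefile, text, "a") calls above (an assumption, not the project's actual code):

def writeToFile(path, text, mode):
    # The mode is passed straight through, so "a" appends one
    # feature line per call without truncating the file.
    with open(path, mode) as out:
        out.write(text)
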
Example #2
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    # Walk the TextRank score list from the back to collect
    # five valid sentences as negative examples
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        sent_indices.append(idx)
        neg_sents.append(doc[idx].sentence.encode('utf-8'))
        num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "File processed to create feature vectors for training."
Example #3
from random import choice

all_sets = []   # seven folds of document keys
precision = []  # per-fold results collected at the bottom
recall = []

for i in range(7):
    fold = []
    for _ in range(11):
        curr = choice(bucket)
        fold.append(curr)
        bucket.remove(curr)
    all_sets.append(fold)

for i in range(7):
    test_set = all_sets[i]
    train_set = []
    for fold in [all_sets[z] for z in range(7) if z != i]:
        train_set.extend(fold)
    for key in train_set:
        writeToFile(featurefile, data[key]['features'] + '\n', 'a')
    trainSvm(featurefile, model, gamma=1)
    # Predict on the training fold itself to report training performance
    predictSvm(featurefile, model, outfile)
    outstring = "Training Fold : " + str(i)
    print "************* " + outstring + " *************"
    analyze(featurefile, outfile, resfile, outstring)

    deleteFiles([featurefile, outfile])

    # Rebuild the feature file with the held-out fold and evaluate on it
    for key in test_set:
        writeToFile(featurefile, data[key]['features'] + '\n', 'a')
    predictSvm(featurefile, model, outfile)
    outstring = "Testing Fold : " + str(i)
    pre, rec = analyze(featurefile, outfile, resfile, outstring)
    precision.append(pre)
    recall.append(rec)
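
The fold construction above draws eleven keys at a time with choice/remove, yielding seven disjoint folds for cross-validation. Assuming bucket is a plain list of keys, an equivalent split can be sketched with a single shuffle (a hypothetical alternative, not the original code):

import random

def make_folds(bucket, n_folds=7, fold_size=11):
    # Shuffle once, then slice consecutive disjoint folds;
    # equivalent to repeated sampling without replacement.
    keys = list(bucket)
    random.shuffle(keys)
    return [keys[i * fold_size:(i + 1) * fold_size] for i in range(n_folds)]

bucket = list(range(77))  # e.g., 77 document keys
all_sets = make_folds(bucket)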