Code example #1
from nltk.corpus import treebank
from nltk.parse.stanford import StanfordParser


def main(transform_func=None, n=10):
    # precision_and_recall_stat and get_nodes_with_range are defined elsewhere in this module
    parser = StanfordParser(
        path_to_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar="/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    # evaluate on the last n treebank sentences
    test_sents = treebank.sents()[-n:]

    print "len(test_sents) = %d" % len(test_sents)

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s]
                      for s in test_sents]  # transform it

    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)

    # align the gold parses with the same last n sentences
    gold_parses = treebank.parsed_sents()[-n:]

    print "evaluating"

    correct_n = gold_n = predicted_n = 0.0

    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse),
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn

    print "Precision: %f, Recall: %f" % (correct_n / predicted_n, correct_n / gold_n)
Code example #2
from nltk.parse.stanford import StanfordParser


def parseThisSents(sentences):
    """Return the text of every NP and VP phrase found in the given sentences."""
    # split each raw sentence string into a list of tokens
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(sentence.split())

    # my_path_to_models_jar3 and my_path_to_jar3 are defined elsewhere in this module
    parser = StanfordParser(path_to_models_jar=my_path_to_models_jar3,
                            path_to_jar=my_path_to_jar3)
    parses = list(parser.parse_sents(tokenized_sentences))

    phrasesList = []
    for sentence_parses in parses:
        for tree in sentence_parses:
            # collect the leaves of every noun-phrase and verb-phrase subtree
            for s in tree.subtrees(
                    lambda t: t.label() == 'NP' or t.label() == 'VP'):
                phrasesList.append(" ".join(s.leaves()))
    return phrasesList
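A minimal usage sketch, assuming the two jar-path variables point at a local Stanford Parser download (the paths and the expected output are illustrative only):

my_path_to_jar3 = "/path/to/stanford-parser.jar"                 # hypothetical location
my_path_to_models_jar3 = "/path/to/stanford-parser-models.jar"   # hypothetical location

phrases = parseThisSents(["The quick brown fox jumps over the lazy dog ."])
print(phrases)  # e.g. ['The quick brown fox', 'jumps over the lazy dog', 'the lazy dog']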
Code example #3
from nltk import Tree
from nltk.parse.stanford import StanfordParser


class SentenceCompress:
    def __init__(self):
        self.parser = StanfordParser()

    # takes a list of sentences, where each sentence is a list of token strings
    def syntax_parse(self, sentences):
        self.parsed_sentences = self.parser.parse_sents(sentences[:10])  # only testing w/ first 10
        for list_iter in self.parsed_sentences:
            for t in list_iter:
                print(t)
                self.traverse_tree(t)

    def word_significance(self, w):
        # I_j(w_i) =
        #   tf_ij * idf_i           if w_i is a verb or common noun
        #   tf_ij * idf_i + omega   if w_i is a proper noun
        #   0                       otherwise
        pass

    def traverse_tree(self, tree):
        for node in tree:
            print(node)
            if isinstance(node, Tree):
                self.traverse_tree(node)
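The word_significance stub describes the usual tf-idf weighting from sentence-compression work. A minimal standalone sketch of that formula, assuming the caller supplies the tf/idf values and a Penn Treebank POS tag (the proper-noun bonus omega is a free parameter here, not something fixed by the original code):

def word_significance(tf, idf, pos_tag, omega=3.0):
    # tf * idf for verbs and common nouns, plus an omega bonus for proper nouns, 0 otherwise
    if pos_tag.startswith('NNP'):                               # proper noun (NNP / NNPS)
        return tf * idf + omega
    if pos_tag.startswith('VB') or pos_tag in ('NN', 'NNS'):    # verb or common noun
        return tf * idf
    return 0.0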
Code example #4
File: featuremaker.py  Project: marjanhs/AA_CNN
import nltk
import numpy as np
import readability
from nltk.parse.stanford import StanfordParser
from nltk.tag.stanford import StanfordPOSTagger
from unidecode import unidecode


class FeatureMaker:

    _sentence_data = None
    _split_data = None
    _stf_pos_tagger = None
    _stf_parser = None

    _pos_list = []
    _neg_list = []

    def __init__(self, data):
        self._split_data = data
        self._sentence_data = [" ".join(line) for line in self._split_data]

    def _pos_tag_sent(self, sent):
        # text = word_tokenize("And now for something completely different")
        return nltk.pos_tag(sent)

    def _sf_pos_tag_sent(self, sent):
        return self._stf_pos_tagger.tag(sent)

    def prefix_suffix(self):
        prefix_2 = []
        prefix_3 = []
        suffix_2 = []
        suffix_3 = []
        for line in self._split_data:
            prefix_2.append([w[:2] for w in line])
            prefix_3.append([w[:3] for w in line])
            suffix_2.append([w[-2:] for w in line])
            suffix_3.append([w[-3:] for w in line])

        return [prefix_2, prefix_3, suffix_2, suffix_3]

    def fast_pos_tag(self):
        tag_result = [[token[1] for token in self._pos_tag_sent(line)] for line in self._split_data]
        return tag_result

    def pos_tag(self):
        if self._stf_pos_tagger is None:
            self._stf_pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
        index = 0
        tag_result = []
        while index < len(self._split_data):
            temp = self._stf_pos_tagger.tag_sents(self._split_data[index:index+1000])
            tag_result.extend(temp)
            index += 1000
            print(("pos:" + str(index)), end=' ')
        # tag_result = self._stf_pos_tagger.tag_sents(self._split_data)
        tag_result = [[unidecode(p[1]) for p in line] for line in tag_result]

        # for line in tag_result:
        #     print str(line)
        return tag_result

    def parser(self):
        if self._stf_parser is None:
            self._stf_parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        result = self._stf_parser.parse_sents(self._split_data)
        result = sum([[parse for parse in dep_graphs] for dep_graphs in result], [])
        for i in result:
            print(i)

    def per_word_length(self):
        wl_result = [[len(w) for w in line] for line in self._split_data]
        return wl_result

    def sentence_avg_word_length(self):
        wl_result = self.per_word_length()
        wl_result = [np.mean(line) for line in wl_result]
        return wl_result

    def sentence_length(self):
        sl_result = [len(line) for line in self._split_data]
        return sl_result

    def sentence_length_mean_sd(self):
        return np.mean(self.sentence_length()), np.std(self.sentence_length())

    def load_sentiment_list(self):
        if not self._pos_list:
            with open("./../pos_neg/positive-words.txt", mode='r') as f:
                file_content = f.readlines()
                for line in file_content:
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._pos_list.append(line)
        if not self._neg_list:
            with open("./../pos_neg/negative-words.txt", mode='r') as f:
                file_content = f.readlines()
                for line in file_content:
                    line = line.strip()
                    if not line.startswith(";") and line:
                        self._neg_list.append(line)
        return [self._pos_list, self._neg_list]

    def sentiment_sequence(self):
        sentiment_data = []
        for line in self._split_data:
            sentiment_line = []
            for word in line:
                if word in self._pos_list:
                    sentiment_line.append("POS")
                elif word in self._neg_list:
                    sentiment_line.append("NEG")
                else:
                    sentiment_line.append("NON")
            sentiment_data.append(sentiment_line)
        return sentiment_data

    def get_read_measure(self):
        value_list = []
        for cat, data in list(readability.getmeasures(self._sentence_data, lang='en').items()):
            print(('%s:' % cat))
            for key, val in list(data.items()):
                print((('    %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.')))

            # keep the last reported value from each readability category
            value_list.append(val)
        return value_list
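A brief usage sketch, assuming pre-tokenized input sentences and, for the sentiment features, that the pos_neg lexicon files referenced above are present (the data and the printed outputs are illustrative):

data = [["The", "movie", "was", "great"], ["I", "hated", "the", "ending"]]
fm = FeatureMaker(data)

print(fm.prefix_suffix()[0])      # 2-character prefixes per token
print(fm.sentence_length())       # [4, 4]
fm.load_sentiment_list()          # reads ./../pos_neg/positive-words.txt and negative-words.txt
print(fm.sentiment_sequence())    # e.g. [['NON', 'NON', 'NON', 'POS'], ['NON', 'NEG', 'NON', 'NON']]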
Code example #5
File: util.py  Project: apple/ml-cread
def parse_key_file(key_file):
    try:
        from nltk.parse.stanford import StanfordParser
        parser = StanfordParser(
            model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            java_options='-Xmx8G')
        print("Starting to parse key_file!")
        print("This might take a while...")
        new_file = open(key_file + ".parsed", "w")
        with open(key_file) as f:
            tmp_sentence = [[]]
            tmp_conll_lines = []
            for line in f:
                if line.startswith("#begin"):
                    new_file.write(line)
                    continue
                elif len(line.strip()) == 0 or (line.startswith("#end")
                                                and len(tmp_conll_lines) > 0):
                    parse = parser.parse_sents(tmp_sentence)
                    for tree in parse:
                        for tree_line in tree:  # tree_line is a Tree
                            parse_string = ' '.join(str(tree_line).split())
                            treecomp = parse_string.split()
                            currlowestindex = 0
                            token_index = 0
                            for idx, val in enumerate(treecomp):
                                if not val.startswith("("):
                                    firstindexofbracket = val.index(")")
                                    lastindex = len(val) - 1
                                    tag_components = []
                                    pos_tag = treecomp[idx - 1].replace(
                                        "(", "")
                                    if currlowestindex == idx - 1:
                                        if firstindexofbracket == lastindex:
                                            tag_components.append("*")
                                        else:
                                            parsecol = "*" + val[
                                                firstindexofbracket:lastindex]
                                            tag_components.append(parsecol)
                                    else:
                                        for i in range(currlowestindex,
                                                       idx - 1):
                                            tag_components.append(treecomp[i])
                                        if firstindexofbracket == lastindex:
                                            tag_components.append("*")
                                        else:
                                            parsecol = "*" + val[
                                                firstindexofbracket:lastindex]
                                            tag_components.append(parsecol)
                                    currlowestindex = idx + 1

                                    new_file.write(
                                        '\t'.join(tmp_conll_lines[token_index].
                                                  split()[0:4]) + "\t" +
                                        pos_tag + "\t" +
                                        ''.join(tag_components) + '\t' +
                                        '\t'.join(tmp_conll_lines[token_index].
                                                  split()[4:]) + '\n')
                                    token_index += 1

                    tmp_sentence[0] = []
                    tmp_conll_lines = []
                    new_file.write("\n")

                elif not line.startswith("#"):
                    word = line.split()[3]
                    word_uc = word  #.decode(encoding='UTF-8')
                    tmp_sentence[0].append(word_uc)
                    tmp_conll_lines.append(line)
                if line.startswith("#end"):
                    new_file.write(line)

    except:
        print(
            "You need to set the CLASSPATH environment variable to point to the Stanford parser!"
        )
        print(
            "Example: export CLASSPATH=/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser.jar:/path/to/stanford-parser-full-YYYY-MM-DD/stanford-parser-X.X.X-models.jar"
        )
        print("")
        raise
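nltk locates the Stanford jars through the environment, which is what the error message above points at; a hypothetical invocation would set CLASSPATH first and then call the function on a CoNLL-style key file (the jar locations and file name below are placeholders):

import os

# hypothetical jar locations; adjust to the local Stanford Parser download
os.environ['CLASSPATH'] = ('/opt/stanford-parser-full-2015-01-30/stanford-parser.jar:'
                           '/opt/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar')

parse_key_file("dev.key")   # writes dev.key.parsed next to the input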
Code example #6
File: end_to_end.py  Project: griff4692/ml_projects
def main(in_path, outpath):
    nltk.download('punkt')  # sent_tokenize below needs the punkt sentence tokenizer

    span_extractor = torch.load(os.path.join(EXPERIMENT,
                                             'best_span_extractor.tar'),
                                map_location='cpu')
    answer_verifier = torch.load(os.path.join(EXPERIMENT,
                                              'best_answer_verifier.tar'),
                                 map_location='cpu')
    span_extractor.use_cuda = False
    answer_verifier.use_cuda = False

    # same tokenizer used by the lexical parser
    tokenizer = StanfordTokenizer(options={'ptb3Escaping': True})
    parser = StanfordParser(java_options='-mx5g')

    data = json.load(open(in_path, 'r'))['data']
    batches = []
    official_eval = {}
    official_eval_tokens = {}
    qaid_map = {}

    num_articles = len(data)
    for aidx in range(len(data)):
        article = data[aidx]
        print('\t- Article Count=%d/%d' % (aidx + 1, num_articles))
        for pidx, paragraph in enumerate(article['paragraphs']):
            passage, qas = paragraph['context'], paragraph['qas']
            passage = passage.replace(u'\xa0', ' ')
            sentences = sent_tokenize(passage)

            sentence_tokens = [
                tokenizer.tokenize(sentence) for sentence in sentences
            ]
            raw_trees = [
                list(s)[0] for s in list(
                    parser.parse_sents(sentence_tokens, verbose=True))
            ]
            squad_tree = TreePassage(raw_trees)

            for qidx, qa in enumerate(qas):
                question_sentences = sent_tokenize(qa['question'])
                question_tokens = []
                for s in question_sentences:
                    question_tokens += tokenizer.tokenize(s)

                batches.append(
                    Batch([{
                        'apid': 'apid',
                        'qa_id': qa['id'],
                        'context_squad_tree': squad_tree,
                        'question_tokens': question_tokens,
                        'answers': [],
                        'is_impossible': 0
                    }], False))

                qaid_map[qa['id']] = paragraph['context']

    span_extractor.eval()
    answer_verifier.eval()
    for idx, batch in enumerate(batches):
        qa_id = batch.qa_id[0]

        node_scores, expected_f1s, global_answer_score = span_extractor(
            batch, eval_system=True)
        score_confidence, predicted_node_idxs = node_scores.max(dim=1)
        score_confidence, predicted_node_idxs = (variable_to_numpy(
            score_confidence,
            False), variable_to_numpy(predicted_node_idxs, False))

        # Answer score = predicted has answer probability
        answer_score = answer_verifier(batch,
                                       predicted_node_idxs=predicted_node_idxs,
                                       eval_system=True)
        answer_proba = variable_to_numpy(
            Sigmoid()(answer_score),
            False)  # convert from tensor to numpy array
        global_answer_proba = variable_to_numpy(Sigmoid()(global_answer_score),
                                                False)

        has_answer_proba = (0.3 * score_confidence +
                            0.4 * global_answer_proba + 0.3 * answer_proba)[0]

        predicted_span = batch.trees[0].span(predicted_node_idxs[0])
        predicted_has_answer = has_answer_proba >= HAS_ANSWER_THRESHOLD

        predicted_text = tokens_to_text(predicted_span, qaid_map[qa_id])
        official_eval[qa_id] = predicted_text if predicted_has_answer else ''
        official_eval_tokens[qa_id] = ' '.join(
            predicted_span) if predicted_has_answer else ''

    json.dump(official_eval, open(outpath, 'w'))
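main() walks the standard SQuAD JSON layout (data → paragraphs → context / qas) and writes predictions keyed by question id. A minimal, hypothetical input built in Python and fed through the pipeline (it still requires the trained checkpoints referenced by EXPERIMENT to be present):

import json

sample = {"data": [{"paragraphs": [{
    "context": "The Eiffel Tower is in Paris.",
    "qas": [{"id": "q1", "question": "Where is the Eiffel Tower?"}]
}]}]}

with open("sample_input.json", "w") as f:
    json.dump(sample, f)

main("sample_input.json", "predictions.json")   # predictions.json maps each qa id to the predicted answer text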