Example #1
0
def convert_trees(input_file, output_file):
    import json
    import StanfordDependencies
    sd = StanfordDependencies.get_instance(backend='jpype')
    with open(input_file, 'r') as inpf, open(output_file, 'w') as outf:
        for line_no, line in enumerate(inpf):
            if line_no % 1000 == 0:
                print("Processing sentence pair {}.".format(line_no))
            sentence = json.loads(line)
            prem = sentence.get('sentence1_parse')
            hypo = sentence.get('sentence2_parse')
            prem_dep = ' '.join([
                '{}({}({}({}'.format(x.index, x.form, x.head, x.deprel)
                for x in sd.convert_tree(prem)
            ])
            hypo_dep = ' '.join([
                '{}({}({}({}'.format(x.index, x.form, x.head, x.deprel)
                for x in sd.convert_tree(hypo)
            ])
            sentence.update({
                'sentence1_dependency_parse': prem_dep,
                'sentence2_dependency_parse': hypo_dep
            })
            outf.write(json.dumps(sentence) + "\n")

    print("Wrote file with dependency parse annotation to {}".format(
        output_file))
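
A minimal, self-contained sketch of the conversion step the function above relies on. The bracketed tree is a made-up example rather than a line from the input file, and the jpype backend assumes jpype1 and a CoreNLP jar are available; backend='subprocess' works as well.

import StanfordDependencies

sd = StanfordDependencies.get_instance(backend='jpype')
tree = '(ROOT (S (NP (DT The) (NN cat)) (VP (VBZ sleeps)) (. .)))'
for token in sd.convert_tree(tree):
    # Each Token is CoNLL-X style: index, form, head (0 = root), deprel, ...
    print(token.index, token.form, token.head, token.deprel)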
Example #2
0
def parse_reports(data_path, sheet_name, file_path):

    report_data_file = xlrd.open_workbook(data_path)
    sheet = report_data_file.sheet_by_name(sheet_name)

    rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
    sd = StanfordDependencies.get_instance(backend='subprocess')

    for i in range(910, 3852):
        finding = sheet.cell(i, 6).value
        with open(file_path, mode='a') as f:
            f.write('finding no.' + str(i))
            f.write('\n')
        sent_tokenize_list = sent_tokenize(finding)
        for j in range(len(sent_tokenize_list)):
            try:
                with open(file_path, mode='a') as f:
                    f.write('sentence no.' + str(j))
                    f.write('\n')
                sentence = sent_tokenize_list[j]
                tree = rrp.simple_parse(sentence)
                dependencies = sd.convert_tree(tree)
                for token in dependencies:
                    with open(file_path, mode='a') as f:
                        f.write(str(token))
                        f.write('\n')
            except:
                print('error!')
                with open(file_path, mode='a') as f:
                    f.write('error!!!')
                    f.write('\n')
Example #3
0
    def __init__(self,
                 lang={
                     'spacy': 'en',
                     'benepar': 'benepar_en2'
                 },
                 config=None):
        super().__init__()
        self.download = False
        # Checking if NLTK sentence and word tokenizers should be downloaded
        if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
            spacy.load(lang['spacy'])
            config_global['config_benepar'][
                'benepar_sent_word_tok_downloaded'] = True
            self.download = True
        # Checking if parsing model should be downloaded
        if not config_berkeley_nlp['parsing_model_downloaded']:
            benepar.download(lang['benepar'])
            config_global['config_benepar']['parsing_model_downloaded'] = True
            self.download = True
        # Updating yaml file if necessary
        if self.download:
            with open("./config.yaml", "w") as f:
                yaml.dump(config_global, f)

        self.nlp = spacy.load(lang['spacy'])
        self.nlp.add_pipe(BeneparComponent(lang['benepar']))
        self.sd = StanfordDependencies.get_instance(
            backend='subprocess')  # to convert trees
        self.name_save = 'benepar'
Example #4
0
def corenlpLemmaPOS_stanfordparserDependency_split_equalChecking():
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess", version="3.4.1")
    os.environ["STANFORD_PARSER"] = "stanford-parser-full-2014-08-27/"
    os.environ["STANFORD_MODELS"] = "stanford-parser-full-2014-08-27/"
    parser = stanford.StanfordParser(
        model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    with open("../../dataclean_Nov8_2015/train_afterdataclean_modifiedcleanedTupleNov8.json") as t:
        trainTup = json.load(t)
    for num, tup in enumerate(trainTup):
        ## after modifying col8 and saving, col8 may now be empty
        if not tup[8]:
            continue
        ## use corenlp to split sentence
        print "No.", num
        print tup[8]
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]["words"]
        print slist
        temp = []
        for s in slist:
            temp.append(s[0])
        print temp
        ## use the Stanford parser + StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s = ""
        for line in sentences:
            for sentence in line:
                s += str(sentence)

        sent = sd.convert_tree(s)
        print sent
        detemp = []
        for t in sent:
            detemp.append(t[1])
        print detemp
        for di, ti in zip(detemp, temp):
            if di == ti:
                pass
            else:
                if (
                    (ti == "(" and di == "-LRB-")
                    or (ti == ")" and di == "-RRB-")
                    or (ti == "[" and di == "-LSB-")
                    or (ti == "]" and di == "-RSB-")
                ):
                    print "diff in parenthesis"
                    pass
                else:
                    print "{", di, " ,", ti, " }"
Example #5
0
def main():
    if len(sys.argv) < 3:
        print("Usage: path_to_genia path_to_conllu")
        return

    genia_path = sys.argv[1]
    conllu_path = sys.argv[2]

    sd = StanfordDependencies.get_instance()

    with open(conllu_path, mode='w', encoding='utf-8') as conllu:
        convert_genia(genia_path, conllu, sd)
Example #6
0
def f30_usingCorenlpSplit(dataset_,f30file_):
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

    ## load dataset
    with open(dataset_) as f:
        data = json.load(f)

    ## record how many sentence split unequal
    sentSplitUnequal = 0
    ## structure to save {word: abc, grammatical relationship: xx}
    res = []
    for num,tupl in enumerate(data):
        if not tupl[3]:
            continue
        newStr = ' '.join(tupl[3])
        print num,newStr
        
        sentences = parser.raw_parse(newStr)
        s=""
        for line in sentences:
            for sentence in line:
                s+=str(sentence)

        sent = sd.convert_tree(s)
        
        tem = [] ## a list of the dependency-split words
        temp_gram = [] ## record grammatical relationship
        for t in sent:
            detemp = {}
            detemp['Word'] = t[1]
            detemp['Grammatical relation'] = t[7]
           
            temp_gram.append(detemp)
           
            tem.append(t[1])
        if tem != tupl[3]:
            print 'depen split:',tem
            print 'corenlp split:',tupl[3]
            sentSplitUnequal += 1
            ## record index
            print 'unequal sentence No.',num
            print 'No. of sentence:',num,'begin index:',len(res),'end index:',len(res)+len(tupl[3])

        else:
            
            res = res+temp_gram
    with open(f30file_,'w') as l:
        json.dump(res,l)
    print sentSplitUnequal
Example #7
0
def convert_to_sd(scored_parse, sd_converter=None, representation='CCprocessed'):
    """Converts parse tree to Stanford dependencies.

    :param scored_parse: ScoredParse object in RerankingParser
    :type scored_parse: ScoredParse
    :param sd_converter: StanfordDependencies converter
    :type sd_converter: StanfordDependencies
    :param representation: one of 'basic', 'collapsed', 'CCprocessed'
    :type representation: str
    :return: list of Token objects
    :rtype: list
    """
    sd_converter = sd_converter or StanfordDependencies.get_instance()
    return sd_converter.convert_tree(str(scored_parse.ptb_parse), representation=representation)
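
A hedged sketch of the representation argument described in the docstring above; the bracketed tree is a hand-written stand-in for str(scored_parse.ptb_parse).

import StanfordDependencies

sd = StanfordDependencies.get_instance()
tree = '(ROOT (S (NP (DT The) (NN dog)) (VP (VBZ chases) (NP (DT a) (NN cat))) (. .)))'
for representation in ('basic', 'collapsed', 'CCprocessed'):
    tokens = sd.convert_tree(tree, representation=representation)
    print(representation, [(t.form, t.deprel, t.head) for t in tokens])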
Example #8
0
def checkCoreNLPSplit_DependencySplit(file_):
    with open(file_) as f:
        tset = json.load(f)
    ## corenlp setting
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    corenlp = StanfordCoreNLP(corenlp_dir)
    
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    
    for num, tup in enumerate(tset):
        print num
        if not tup[8]:
            continue
        ## use corenlp to split up the sentence
        res = corenlp.parse(tup[8])
        par = json.loads(res)
        slist = par["sentences"][0]['words']
        temp = []
        for s in slist:
            temp.append(s[0])
        
        ## use the Stanford parser + StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s=""
        for line in sentences:
            for sentence in line:
                s+=str(sentence)

        sent = sd.convert_tree(s)
        
        detemp = []
        for t in sent:
            detemp.append(t[1])
            
        ## check if same

        for di,ti in zip(detemp,temp):
            if di == ti:
                pass
            else:
                if (ti == '(' and di == '-LRB-') or (ti == ')' and di == '-RRB-') or (ti == '[' and di == '-LSB-') or (ti == ']' and di == '-RSB-'):
                    print "diff in parenthesis"
                    pass
                else:
                    print "!!!"
                    print "{",di,' ,',ti," }"
Example #9
0
    def __init__(self, document_as_string):
        """ Construct a document from a string representation.

            The format must follow the CoNLL format; see
                http://conll.cemantix.org/2012/data.html.

            Args:
                document_as_string (str): A representation of a document in
                    the CoNLL format.
            """
        identifier = " ".join(document_as_string.split("\n")[0].split(" ")[2:])
        print(identifier)

        self.document_table = CoNLLDocument.__string_to_table(
            document_as_string)
        in_sentence_ids = [int(i) for i in self.__extract_from_column(2)]
        indexing_start = in_sentence_ids[0]
        if indexing_start != 0:
            logger.warning("Detected " +
                           str(indexing_start) +
                           "-based indexing for tokens in sentences in input,"
                           "transformed to 0-based indexing.")
            in_sentence_ids = [i - indexing_start for i in in_sentence_ids]
        sentence_spans = CoNLLDocument.__extract_sentence_spans(in_sentence_ids)
        temp_tokens = self.__extract_from_column(3)
        temp_pos = self.__extract_from_column(4)
        temp_ner = self.__extract_ner()
        temp_speakers = self.__extract_from_column(9)
        coref = CoNLLDocument.__get_span_to_id(self.__extract_from_column(-1))
        parses = [CoNLLDocument.get_parse(span,
                                          self.__extract_from_column(5),
                                          temp_pos,
                                          temp_tokens)
                  for span in sentence_spans]
        sd = StanfordDependencies.get_instance()
        dep_trees = sd.convert_trees(
            [parse.replace("NOPARSE", "S") for parse in parses],include_erased=True
        )
        sentences = []
        for i, span in enumerate(sentence_spans):
            sentences.append(
                (temp_tokens[span.begin:span.end + 1],
                 temp_pos[span.begin:span.end + 1],
                 temp_ner[span.begin:span.end + 1],
                 temp_speakers[span.begin:span.end + 1],
                 parses[i],
                 dep_trees[i])
            )

        super(CoNLLDocument, self).__init__(identifier, sentences, coref)
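
A minimal sketch of the batch conversion used above: convert_trees takes a list of bracketed parses and returns one dependency tree per input, and include_erased keeps tokens that the collapsed representations would otherwise drop. The two parses below are made up for illustration.

import StanfordDependencies

sd = StanfordDependencies.get_instance()
parses = ['(S (NP (PRP I)) (VP (VBP agree)) (. .))',
          '(S (NP (PRP They)) (VP (VBD left)) (. .))']
for dep_tree in sd.convert_trees(parses, include_erased=True):
    print([(token.form, token.deprel, token.head) for token in dep_tree])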
Example #10
0
    def __init__(self, document_as_string):
        """ Construct a document from a string representation.

            The format must follow the CoNLL format; see
                http://conll.cemantix.org/2012/data.html.

            Args:
                document_as_string (str): A representation of a document in
                    the CoNLL format.
            """
        identifier = " ".join(document_as_string.split("\n")[0].split(" ")[2:])

        self.document_table = CoNLLDocument.__string_to_table(
            document_as_string)
        in_sentence_ids = [int(i) for i in self.__extract_from_column(2)]
        indexing_start = in_sentence_ids[0]
        if indexing_start != 0:
            logger.warning("Detected " +
                           str(indexing_start) +
                           "-based indexing for tokens in sentences in input,"
                           "transformed to 0-based indexing.")
            in_sentence_ids = [i - indexing_start for i in in_sentence_ids]
        sentence_spans = CoNLLDocument.__extract_sentence_spans(in_sentence_ids)
        temp_tokens = self.__extract_from_column(3)
        temp_pos = self.__extract_from_column(4)
        temp_ner = self.__extract_ner()
        temp_speakers = self.__extract_from_column(9)
        coref = CoNLLDocument.__get_span_to_id(self.__extract_from_column(-1))
        parses = [CoNLLDocument.get_parse(span,
                                          self.__extract_from_column(5),
                                          temp_pos,
                                          temp_tokens)
                  for span in sentence_spans]
        sd = StanfordDependencies.get_instance()
        dep_trees = sd.convert_trees(
            [parse.replace("NOPARSE", "S") for parse in parses],
        )
        sentences = []
        for i, span in enumerate(sentence_spans):
            sentences.append(
                (temp_tokens[span.begin:span.end + 1],
                 temp_pos[span.begin:span.end + 1],
                 temp_ner[span.begin:span.end + 1],
                 temp_speakers[span.begin:span.end + 1],
                 parses[i],
                 dep_trees[i])
            )

        super(CoNLLDocument, self).__init__(identifier, sentences, coref)
Example #11
0
 def __init__(self, representation='CCprocessed', universal=False):
     """
     Args:
         representation(str): Currently supported representations are
             'basic', 'collapsed', 'CCprocessed', and 'collapsedTree'
         universal(bool): if True, use universal dependencies if they're available
     """
     try:
         import jpype
         self._backend = 'jpype'
     except ImportError:
         self._backend = 'subprocess'
     self._sd = StanfordDependencies.get_instance(backend=self._backend)
     self.representation = representation
     self.universal = universal
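
A hedged sketch, not part of the original class, of how the stored representation and universal settings are presumably forwarded to the converter; convert_tree accepts both keywords, as the command-line example further below also shows. The tree is made up.

import StanfordDependencies

sd = StanfordDependencies.get_instance(backend='subprocess')
tree = '(S (NP (NNP Hari)) (VP (VBD danced)) (. .))'
# universal=True requests Universal Dependencies where the CoreNLP version supports them.
tokens = sd.convert_tree(tree, representation='CCprocessed', universal=False)
for token in tokens:
    print(token)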
Example #12
0
 def sd_tokens(self, sd_converter=None, conversion_kwargs=None):
     """Convert this Tree to Stanford Dependencies
     (requires PyStanfordDependencies). Returns a list of
     StanfordDependencies.Token objects. This method caches
     the converted tokens. You may optionally specify a
     StanfordDependencies instance in sd_converter and keyword
     arguments to StanfordDependencies.convert_tree as a dictionary
     in conversion_kwargs."""
     if not self._sd_tokens:
         try:
             import StanfordDependencies
         except ImportError:
             raise ImportError("For sd_tokens(), you need to install"
                               "PyStanfordDependencies from PyPI")
         sd_converter = sd_converter or StanfordDependencies.get_instance()
         conversion_kwargs = conversion_kwargs or {}
         self._sd_tokens = sd_converter.convert_tree(
             str(self), **conversion_kwargs)
     return self._sd_tokens
Example #13
0
 def sd_tokens(self, sd_converter=None, conversion_kwargs=None):
     """Convert this Tree to Stanford Dependencies
     (requires PyStanfordDependencies). Returns a list of
     StanfordDependencies.Token objects. This method caches
     the converted tokens. You may optionally specify a
     StanfordDependencies instance in sd_converter and keyword
     arguments to StanfordDependencies.convert_tree as a dictionary
     in conversion_kwargs."""
     if not self._sd_tokens:
         try:
             import StanfordDependencies
         except ImportError:
             raise ImportError("For sd_tokens(), you need to install"
                               "PyStanfordDependencies from PyPI")
         sd_converter = sd_converter or StanfordDependencies.get_instance()
         conversion_kwargs = conversion_kwargs or {}
         self._sd_tokens = sd_converter.convert_tree(str(self),
                                                     **conversion_kwargs)
     return self._sd_tokens
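
A hedged usage sketch for the sd_tokens() method listed twice above, assuming the bllipparser package (which provides this Tree class) and PyStanfordDependencies are installed.

from bllipparser import Tree

tree = Tree('(S1 (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test))) (. .)))')
for token in tree.sd_tokens(conversion_kwargs={'representation': 'basic'}):
    print(token.index, token.form, token.deprel, token.head)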
Example #14
0
def convert_tree(line, entities, sent_id):
    print ' convert_tree with '+line
    sd = StanfordDependencies.get_instance(
            jar_filename='/root/xhong/stanford/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar',
            backend='subprocess')
    
    #ex='(ROOT(S(NP (PRP$ My) (NN dog))(ADVP (RB also))(VP (VBZ likes)(S(VP (VBG eating)(NP (NN sausage)))))(. .)))'
    #dependencies = sd.convert_tree(ex, debug=True)
    
    idx = 0
    # convert_tree returns a single sentence as a list of Token objects
    dependencies = sd.convert_tree(line, debug=True)

    for token in dependencies:
        print token
        if token.pos in nouns :
            print ' .. is a noun-'+token.pos
            grammatical_role = '-'
            if token.deprel in subject: 
                grammatical_role = 'S'
            elif token.deprel in object:
                grammatical_role = 'O'
            else:
                grammatical_role = 'X'
            
            # print token.form
            token_lemma = wnl.lemmatize(token.form, get_POS(token.pos))
            print token.form, token_lemma
            ''' if this entity has already occurred in the sentence, store the reference
            with the highest grammatical role, judged here as S > O > X '''
            if token_lemma in entities and entities[token_lemma][sent_id] :
                print str(entities[token_lemma][sent_id]) + ' comparing to '+str(r2i[grammatical_role])
                if (entities[token_lemma][sent_id]) < r2i[grammatical_role]:
                    entities[token_lemma][sent_id] = r2i[grammatical_role]
            else:
                entities[token_lemma][sent_id] = r2i[grammatical_role]
            ''' entity->list of : sentence_number->grammatical_role'''
        idx +=1

    # print entities
    return entities, idx
Example #15
0
def convert_tree(line, entities):
    print ' convert_tree with '+line
    sd = StanfordDependencies.get_instance(
            jar_filename='C:\SMT\StanfordNLP\stanford-corenlp-full-2013-11-12\stanford-corenlp-full-2013-11-12\stanford-corenlp-3.3.0.jar',
            backend='subprocess')
    
    #ex='(ROOT(S(NP (PRP$ My) (NN dog))(ADVP (RB also))(VP (VBZ likes)(S(VP (VBG eating)(NP (NN sausage)))))(. .)))'
    #dependencies = sd.convert_tree(ex, debug=True)
    
    idx = 0
    # convert_tree returns a single sentence as a list of Token objects
    dependencies = sd.convert_tree(line, debug=True)
    
    for token in dependencies:
        print token
        if token.pos in nouns :
            print ' .. is a noun-'+token.pos
            grammatical_role = '-'
            if token.deprel in subject: 
                grammatical_role = 'S'
            elif token.deprel in object:
                grammatical_role = 'O'
            else:
                grammatical_role = 'X'
            
            ''' if this entity has already occurred in the sentence, store the reference
            with the highest grammatical role, judged here as S > O > X '''
            if token.lemma in entities and  entities[token.lemma][idx] :
                print str(entities[token.lemma][idx]) + ' comparing to '+str(r2i[grammatical_role])
                if (entities[token.lemma][idx]) < r2i[grammatical_role]:
                    entities[token.lemma][idx] = r2i[grammatical_role]
            else:
                entities[token.lemma][idx] = r2i[grammatical_role]
            ''' entity->list of : sentence_number->grammatical_role'''
    idx +=1
    return entities, idx
Example #16
0
def f30(file_,wfile_):
    ## stanfordDependencies setting
    sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
    os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2014-08-27/'
    os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2014-08-27/'
    parser = stanford.StanfordParser(model_path="stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    resDepent = []
    ## load dataset
    with open(file_) as t: 
        dataset = json.load(t)

    for num, tup in enumerate(dataset):
        print 'No.'+str(num)+ ": "+ tup[8]
        if not tup[8]:
            continue
        
        ## use the Stanford parser + StanfordDependencies to split the sentence
        sentences = parser.raw_parse(tup[8])
        s=""
        for line in sentences:
            for sentence in line:
                s+=str(sentence)

        sent = sd.convert_tree(s)
        
        
        for t in sent:
            detemp = {}
            detemp['Word'] = t[1]
            detemp['Grammatical relation'] = t[7]
            print detemp
            resDepent.append(detemp)

    with open(wfile_,'w') as u:
        json.dump(resDepent,u)
    print len(resDepent)
Example #17
0
#feature prepare
# f19; fill entail set
stop = stopwords.words('english')
# entailLst = entailfeaturePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/reverb_local_global/Resource0812/reverb_local_clsf_all.txt')
# HedgeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/hedges_hyland2005.txt','r','utf-8') if ('#' not in line)])
# FactiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/factives_hooper1975.txt','r','utf-8') if ('#' not in line)])
# AssertiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/assertives_hooper1975.txt','r','utf-8') if ('#' not in line)])
# ImplicativeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/implicatives_karttunen1971.txt','r','utf-8') if ('#' not in line)])
# ReportLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/report_verbs.txt','r','utf-8') if ('#' not in line)])
# StrongSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff','strongsubj')
# WeakSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff','weaksubj')
# PolarityDict = polarityPrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
# PosiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/positive-words.txt','r','utf-8') if (';' not in line)])
# NegativeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/negative-words.txt','r') if (';' not in line)])
sd = StanfordDependencies.get_instance(backend="subprocess",version='3.4.1')
os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-2012-11-12/'
# os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-2012-11-12/'
# parser = stanford.StanfordParser()
# parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-2012-11-12/stanford-parser-2.0.4-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# npovdict = {}
# with open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/npov-edits/npov_words_lemma_2_downcase.json') as t:
#     npovdict = json.load(t)
# print HedgeLst
freqArtDict = {}

labels = []
Example #18
0
 def __init__(self, model_name=None):
     if model_name is None:
         model_name = 'GENIA+PubMed'
     self.model_name = model_name
     self.sd = StanfordDependencies.get_instance()
Example #19
0
def analyser():
    sd = StanfordDependencies.get_instance(backend='subprocess')
    # tree_maker (defined elsewhere) is expected to return a PTB-style bracketed parse
    sent = sd.convert_tree(tree_maker('Hari was a great dancer.'))
    for token in sent:
        print(token)
Example #20
0
def getDependency(dependencyInput):
    parsedtext = rrp.simple_parse(dependencyInput)
    # Convert the Penn Treebank-format parse to Stanford Dependencies
    sd = StanfordDependencies.get_instance(backend='subprocess')
    sent = sd.convert_tree(parsedtext)
    return sent
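
getDependency above relies on a module-level rrp that is not shown. A hedged end-to-end sketch, assuming rrp is a bllipparser RerankingParser; the WSJ-PTB3 model name is an assumption borrowed from a later example.

from bllipparser import RerankingParser
import StanfordDependencies

rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
sd = StanfordDependencies.get_instance(backend='subprocess')

parsedtext = rrp.simple_parse('Hari was a great dancer.')  # PTB-style bracketing
for token in sd.convert_tree(parsedtext):
    print(token)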
Example #21
0
    # test = f_test.readlines()
    
    # cnt = Counter()
    # relations = IndexDict()
    # vol = IndexDict()

    # os.environ['STANFORD_PARSER'] = STANFORD_DIR + "stanford-parser.jar"
    # os.environ['STANFORD_MODELS'] = STANFORD_DIR + "stanford-parser-3.3.0-models.jar"

    # parser = stanford.StanfordParser(model_path= STANFORD_DIR + "englishPCFG.ser.gz")
    # st = (" ".join(nltk.word_tokenize("Hello, My (name) is Melroy."))).replace("(", "-LRB-").replace(")","-RRB-")
    # sentences = parse_tokenized_sentences(parser, [st])
    # print st
    # print sentences

    sd = StanfordDependencies.get_instance(version='3.3.0')

    # print [sd.convert_tree(s._pprint_flat(nodesep='', parens='()', quotes=False)) for s in sentences]


    # arg1s = []
    # arg2s = []
    # ind = 0
    # for line in train:
    #     ind += 1
    #     print ind
    #     args = line.split("||||")
    #     arg1s.append(((" ".join( [w for w in nltk.word_tokenize(args[0].strip()) if len(w) > 0] )).strip()).replace("(", "-LRB-").replace(")","-RRB-").encode('utf-8') )
    #     arg2s.append(((" ".join( [w for w in nltk.word_tokenize(args[1].strip()) if len(w) > 0] )).strip()).replace("(", "-LRB-").replace(")","-RRB-").encode('utf-8') )

    # print len(arg1s), len(arg2s)
Example #22
0
def _main(args):

    if args.deps:
        import StanfordDependencies
        dep = StanfordDependencies.get_instance(backend='subprocess')

    delta = args.delta
    assert 0 <= delta <= 1

    T = {}
    P = {}
    check_llh = 1

    color = {name: c for name, c in zip(sorted(P), 'rgbym' * len(P))}
    marker = {name: 'o' for name in P}

    if args.experiment not in ('grammars', ):
        # benchmark default parser
        if args.grammar == 'medium':
            grammar_file = 'data/medium'
            g = Grammar.load('data/medium')
        elif args.grammar == 'big':
            grammar_file = 'data/bubs/wsj_6'
        chomsky = False
        g = Grammar.load(grammar_file)

    if args.experiment == 'default-parser':
        P['lchild'] = Parser(leftchild, g, chomsky=0)

    elif args.experiment == 'grammar-loops':
        # Experiment: grammar loops
        P['lcbptr'] = Parser(leftchild_bp, g, chomsky=chomsky)
        P['lchild'] = Parser(leftchild, g, chomsky=chomsky)
        #P['x-prod'] = Parser(xprod, g, chomsky=chomsky)
        #P['agenda'] = AgendaParser(g, chomsky=chomsky)

    elif args.experiment == 'grammars':
        #P, color, marker = _leftchild_v_dense_yj_on_many_grammars()
        P, color, marker = _many_grammars()
        check_llh = False
    else:
        raise ValueError('Failed to recognize experiment %r' % args.experiment)

    T = {x: Timer(x) for x in P}
    overall = []
    errors = []

    examples = ptb(args.fold,
                   minlength=3,
                   maxlength=args.maxlength,
                   n=args.examples)

    if 1:
        examples = list(examples)
        np.random.shuffle(examples)

    _evalb_gold = {}
    _evalb_pred = {}
    for k, p in enumerate(P):
        _evalb_gold[p] = open('tmp/evalb-%s.gold' % k, 'wb')
        _evalb_pred[p] = open('tmp/evalb-%s.pred' % k, 'wb')

    if args.policy:
        from ldp.prune.features import Features
        theta = np.load(args.policy)['coef']

        policy_grammar = Grammar.load(
            'data/bubs/wsj_6')  # FIXME: shouldn't be hardcoded!
        F = Features(policy_grammar, nfeatures=2**22)

    for i, (s, t) in enumerate(examples):
        print
        print green % 'Example: %s, length: %s' % (i, len(s.split()))
        print yellow % s

        e = Example(s, grammar=None, gold=t)
        sentence = e.tokens
        N = e.N

        if args.policy:
            e.tokens = policy_grammar.encode_sentence(e.sentence.split())
            keep = F.mask(e, theta)

        else:
            # don't prune anything
            keep = np.ones((N, N + 1), dtype=np.int)
            for x in e.nodes:
                keep[x] = np.random.uniform(0, 1) <= delta
            for x in e.gold_spans:
                keep[x] = 1

        data = []

        #ugold = Tree.fromstring(e.gold_unbinarized)

        if args.deps:
            dep_gold = dep.convert_tree(
                e.gold_unbinarized,
                universal=0)  # TODO: include function tags???
            dep_unlabel_gold = {(z.index, z.head) for z in dep_gold}
            dep_label_gold = {(z.index, z.deprel, z.head) for z in dep_gold}

        for parser in sorted(P):

            b4 = time()

            with T[parser]:
                state = P[parser](e, keep)

            wallclock = time() - b4

            s = state.likelihood
            d = state.derivation
            pops = state.pops
            pushes = state.pushes

            ucoarse = P[parser].decode(e, d)

            #            print
            #            print parser
            #            print ucoarse

            # write gold and predicted trees to files so we can call evalb
            print >> _evalb_gold[parser], e.gold_unbinarized
            print >> _evalb_pred[parser], oneline(ucoarse)

            GW, G, W = evalb_unofficial(e.gold_unbinarized, binarize(ucoarse))
            h = cgw_f(GW, G, W)
            #            h = evalb(e.gold_unbinarized, ucoarse)

            row = {
                'name': parser,
                'llh': s,
                'sentence': sentence,
                'N': N,
                #'tree': tree,
                'evalb': h,
                'GotWant': GW,
                'Got': G,
                'Want': W,
                'pops': pops,
                'pushes': pushes,
                'wallclock': wallclock
            }

            if args.deps:
                # TODO: include function tags? What is the correct way to get target trees?
                dep_parse = dep.convert_tree(oneline(ucoarse), universal=0)
                dep_label = {(z.index, z.deprel, z.head) for z in dep_parse}
                dep_unlabel = {(z.index, z.head) for z in dep_parse}

                # TODO: Use the official eval.pl script from CoNLL task.
                UAS = len(dep_unlabel & dep_unlabel_gold) / e.N
                LAS = len(dep_label & dep_label_gold) / e.N
                row['LAS'] = LAS
                row['UAS'] = UAS

            data.append(row)
            overall.append(row)

        df = DataFrame(overall).groupby('name').mean()
        #df['wallclock'] = sum_df.wallclock  # use total time

        df.sort_values('wallclock', inplace=1)
        df['speedup'] = df.wallclock.max() / df.wallclock
        df['wps'] = df['N'] / df['wallclock']  # ok to use avg instead of sum

        # Determine which columns to display given command-line options.
        show_cols = [
            'evalb_corpus', 'wallclock', 'wps', 'speedup', 'pushes', 'pops',
            'LAS', 'UAS'
        ]
        if len(P) == 1:
            show_cols.remove('speedup')
        if not args.deps:
            show_cols.remove('LAS')
            show_cols.remove('UAS')

        def foo(df):
            "Add column"
            s = DataFrame(overall).groupby(
                'name').sum()  # create separate sum dataframe.
            P = s.GotWant / s.Got
            R = s.GotWant / s.Want
            df['evalb_corpus'] = 2 * P * R / (P + R)
            df['evalb_avg'] = df.pop('evalb')  # get rid of old column.

        foo(df)

        print df[show_cols]

        if args.pareto:
            accuracy_name = 'evalb'
            with axman('speed-accuracy ($\delta= %g$)' % delta) as ax:
                df = DataFrame(overall).groupby('name').mean()
                runtime = df.wallclock / df.wallclock.max()
                for name, x, y in zip(df.index, runtime, df[accuracy_name]):
                    c = color[name]
                    ax.scatter([x], [y],
                               alpha=0.75,
                               lw=0,
                               s=50,
                               c=c,
                               label=name,
                               marker=marker[name])
                ax.legend(loc=4)
                ax.set_xlim(-0.1, 1.1)
                ax.set_ylim(0, 1)
                ax.grid(True)
                ax.set_xlabel('runtime (relative to slowest)')
                ax.set_ylabel('accuracy (%s)' % accuracy_name)
                show_frontier(runtime, df[accuracy_name], ax=ax)

        if args.bylength:
            # Breakdown runtime differences of parsers by length.
            bylength = {name: [] for name in T}
            for length, df in DataFrame(overall).groupby('N'):
                df = df.groupby('name').mean()
                for name, v in df.wallclock.iteritems():
                    bylength[name].append([length, v])
            with axman('benchmark') as ax:
                for name, d in sorted(bylength.items()):
                    d.sort()
                    xs, ys = np.array(d).T
                    ax.plot(xs, ys, alpha=0.5, c=color[name], label=name)
                    ax.scatter(xs, ys, alpha=0.5, lw=1, c=color[name])
                ax.legend(loc=2)
                ax.set_xlabel('sentence length')
                ax.set_ylabel('seconds / sentence')

        if check_llh:
            # Only run this test when it makes sense, e.g., when all parses come
            # from the same grammar.
            s0 = data[0]['llh']
            for x in data:
                s = x['llh']
                name = x['name']
                if abs(s0 - s) > 1e-10:
                    errors.append({'parser': name, 'sentence': sentence})
                    print '[%s]: name: %s expect: %g got: %g' % (red % 'error',
                                                                 name, s0, s)

        Timer.compare_many(*T.values(), verbose=False)

        if errors:
            print red % 'errors: %s' % len(errors)

    print
    print green % '==============================='
    print green % 'DONE!'
    print

    print 'EVALB-unofficial:'
    print 2 * (df.GotWant / df.Got * df.GotWant /
               df.Want) / (df.GotWant / df.Got + df.GotWant / df.Want)
    print
    print 'EVALB-official:'
    import os
    for k, p in enumerate(P):
        _evalb_pred[p].close()
        _evalb_gold[p].close()
        out = 'tmp/evalb-%s.out' % k
        os.system('./bin/EVALB/evalb %s %s > %s' %
                  (_evalb_gold[p].name, _evalb_pred[p].name, out))
        with file(out) as f:
            for x in f:
                if x.startswith('Bracketing FMeasure'):
                    print p, float(x.strip().split()[-1])
                    break  # use the first one which is for all lengths
Example #23
0
"""
sources: https://pypi.org/project/PyStanfordDependencies/
         https://stackoverflow.com/questions/13883277/stanford-parser-and-nltk


"""
import StanfordDependencies, os.path, sys
from nltk.parse.stanford import StanfordParser
# be sure the environment variables pointing to englishPCFG.ser.gz are set
parser = StanfordParser()
sd = StanfordDependencies.get_instance(backend='subprocess')


def getTypeD(input):
    'returns the string with the dependency tags'
    sS = ""
    myList = list(parser.raw_parse(input))

    for l in myList:
        sS += str(l)

    return sS


def createDepData(tag_sent):
    'method from the PyStanfordDependencies 0.3.1 package'
    data = sd.convert_tree(tag_sent)

    return data
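
A possible way to chain the two helpers above; this is a hedged sketch rather than part of the original module, and it assumes the Stanford parser environment is set up as the comment above StanfordParser() requires.

if __name__ == '__main__':
    bracketed = getTypeD('The quick brown fox jumps over the lazy dog.')
    for token in createDepData(bracketed):
        print(token)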

Example #24
0
import logging
import os
from collections import defaultdict
from glob import glob

import numpy as np
import StanfordDependencies
from docopt import docopt
from joblib import Parallel, delayed
from pycorenlp import StanfordCoreNLP
from tqdm import tqdm

logger = logging.getLogger(__name__)

class_paths = os.environ['CLASSPATH'].split(':')
corenlp_path = list(
    filter(lambda p: p.endswith('stanford-corenlp-3.9.2.jar'), class_paths))[0]
sd = StanfordDependencies.get_instance(jar_filename=corenlp_path,
                                       backend='jpype')

noun_tags = ['NNP', 'NP', 'NNS', 'NN', 'N', 'NE']
subject_tags = ['csubj', 'csubjpass', 'subj', 'nsubj', 'nsubjpass']
object_tags = ['pobj', 'dobj', 'iobj']

idx2role = ['-', 'X', 'O', 'S']
role2idx = {role: idx for idx, role in enumerate(idx2role)}


def get_deps(sentences):
    entities = defaultdict(lambda: defaultdict(dict))
    parse_trees = []
    for sentence in sentences:
        parse_trees.append(sentence['parse'])
Example #25
0
import StanfordDependencies
import simplejson
import sys

dp = StanfordDependencies.get_instance(backend='subprocess')


def to_nltk_tree(sent):
    """Transforms a sentence to an NLTK tree.
    :param sent: (str) a bracketed parse
    :return: (nltk.Tree) an NLTK ``Tree'' instance
    """
    import nltk
    return nltk.Tree.fromstring(sent)


def to_cnf(sent):
    """Transforms a parsed sentence to a parsed sentence in CNF.
    :param sent: (str) a bracketed parse
    :return: (str) a bracketed parse in Chomsky Normal Form
    """
    import nltk
    tree = nltk.Tree.fromstring(sent)
    tree.chomsky_normal_form()
    return str(tree)


def to_deps(sent):
    """Transforms a parsed sentence to raw universal dependencies.
    :param sent: (str) a bracketed parse
    :return: (list[Token]) a list of Tokens representing a dependency tree
Example #26
0
def main():
    """
    Imports a data set from the Penn Treebank.
    Splits the data set into training and test set.
    Calls the functions to learn a grammar and to perform parsing and evaluation.
    """
    fileids = treebank.fileids()
    # This file is removed because it does not contain a complete sentence:
    fileids.remove('wsj_0056.mrg')
    sd = StanfordDependencies.get_instance(backend='subprocess')

    # Toy sentence, not extracted from the Penn Treebank. No gold standard is
    # available for this sentence, therefore evaluation must be done manually.
    # In order to allow a better visual representation of the chart in the attached
    # report, the period at the end of the sentence is omitted.
    simple_sent = 'The cat sleeps in the garden'

    os.system('clear')

    while True:
        answer_1 = input('\nPlease select one of the following actions (1, 2):\n\n' \
                         '1 - Use the same splitting in training and test set that was used ' \
                         'for the evaluation illustrated in the report. If you choose this ' \
                         'option, the parser will use an already existing grammar.\n\n'
                         '2 - Randomly split the data set into training and test set and ' \
                         'perform a new evaluation. If you choose this option, the parser will ' \
                         'have to learn a new grammar based on the training set. ' \
                         'This might take a few minutes.\n')
        if answer_1 in ['1', '2']:
            break

    if answer_1 == '1':
        test_set = ['wsj_0004.mrg', 'wsj_0008.mrg', 'wsj_0014.mrg', 'wsj_0050.mrg', \
                    'wsj_0063.mrg', 'wsj_0065.mrg', 'wsj_0070.mrg', 'wsj_0073.mrg', \
                    'wsj_0089.mrg', 'wsj_0096.mrg', 'wsj_0099.mrg', 'wsj_0118.mrg', \
                    'wsj_0120.mrg', 'wsj_0137.mrg', 'wsj_0144.mrg', 'wsj_0165.mrg', \
                    'wsj_0171.mrg', 'wsj_0181.mrg', 'wsj_0182.mrg', 'wsj_0199.mrg']

        training_set = [fileid for fileid in fileids if fileid not in test_set]
        print_test_training_set(test_set, training_set)

        with open('grammar_reduced.json', 'rb') as data_file:
            grammar = json.load(data_file)

    elif answer_1 == '2':
        test_set, training_set = split_data_set(fileids)
        print_test_training_set(test_set, training_set)
        while True:
            answer_2 = input(
                "\nPlease press 't' to proceed with the training.\n")
            if answer_2 == 't':
                break

        tic = time.time()
        # Call function to learn a grammar from the Penn Treebank
        grammar = learn_grammar(training_set, sd)
        toc = time.time() - tic
        print('\nIt took {0:.4f} seconds to learn a grammar'.format(toc))

    while True:
        answer_3 = input("\nPlease press 'p' to proceed with the parsing.\n")
        if answer_3 == 'p':
            break
    valid = []
    test_sentences = []
    for i, fileid in enumerate(test_set):
        # Call function to convert the conll format into a string
        input_sent_test = get_sentence(fileid, sd)
        test_sentences.append(input_sent_test)
        print()
        print('Converting sentence n. {} into string form...'.format(i + 1))
        valid.append(str(i + 1))

    quit_program = 'no'
    while True:
        print('\n\nThis is a list of the sentences in the test set:\n')
        for i, (sentence, fileid) in enumerate(zip(test_sentences, test_set)):
            print()
            print(
                str(i + 1) + ') id: ' + fileid + '\tSentence length: ' +
                str(len(sentence.split())))
            print(sentence)
            print()
        print('-' * 40)
        print()
        while True:
            to_parse = input('\nPlease select the number of the sentence to be parsed.\n' \
                             "Press 't' to parse the toy sentence that is described in the report.\n" \
                             "Press 's' to type your own sentence.\n" \
                             "Press 'q' to quit.\n")
            if to_parse in valid or to_parse in ['t', 's']:
                break
            elif to_parse == 'q':
                quit_program = 'yes'
                break

        if quit_program == 'yes':
            break
        if to_parse == 't':
            test_sent = simple_sent
        elif to_parse == 's':
            test_sent = input(
                'Please type the sentence you would like to parse.\n')
        else:
            test_sent = test_sentences[int(to_parse) - 1]
        # Call function to perform the parsing of the selected sentence
        dict_parse_final = parse_sentence(grammar, test_sent)
        if dict_parse_final == 0:
            continue
        if to_parse in ['t', 's']:
            print('\nNo gold standard available for the present sentence\n')
            while True:
                answer_9 = input(
                    "\nPlease press 's' to parse another sentence.\n")
                if answer_9 == 's':
                    break
        else:
            print()
            print('*' * 50)
            while True:
                answer_4 = input(
                    "\nPlease press 'e' to proceed with the evaluation.\n")
                if answer_4 == 'e':
                    ref_sentence = gold_standard(test_set[int(to_parse) - 1],
                                                 sd)
                    # Call function to perform the evaluation
                    evaluation(ref_sentence, dict_parse_final)
                    break
Example #27
0
 def __init__(self):
     self.rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)
     self.sd = StanfordDependencies.get_instance(backend='subprocess')
Example #28
0
                    action='store_false',
                    help="Don't use Universal Dependencies (if available)")
parser.add_argument('-V', '--corenlp-version', dest='version',
                    metavar='VERSION',
                    help="Version of CoreNLP (will use default if not set)")
parser.add_argument('-d', '--debug', action='store_true',
                    help="Enable debugging (subprocess only)")

args = parser.parse_args()
if args.debug:
    print('Args:', args)

conversion_args = dict(representation=args.representation,
                       universal=args.universal)
if args.debug:
    if args.backend == 'subprocess':
        conversion_args['debug'] = True
    else:
        print("Warning: Can only set debug flag in subprocess backend.",
              file=sys.stderr)

sd = StanfordDependencies.get_instance(backend=args.backend,
                                       version=args.version)
if not args.filenames: # interactive mode
    print("Ready to read and convert trees (one per line)")
for tree in fileinput.input(args.filenames):
    print('Tree: %r' % tree)
    tokens = sd.convert_tree(tree, **conversion_args)
    for token in tokens:
        print(token)
Example #29
0
def head_related(query, candidate):
    lmt = WordNetLemmatizer()

    sd = StanfordDependencies.get_instance(backend='subprocess')
    a = Annotator()
    synTree = a.getAnnotations(query)['syntax_tree']

    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))

    qHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            qHeadWords.append(lmt.lemmatize(s[1][1], 'v'))

    synTree = a.getAnnotations(candidate)['syntax_tree']

    tokens = sd.convert_tree(synTree)
    queue = []
    for i, token in enumerate(tokens):
        if token[6] == 0:
            queue.append((i + 1, token))

    cHeadWords = []
    while queue != []:
        s = queue[0]
        queue.remove(s)
        flag = 0
        #print s[1][1], s[0]
        for i, word in enumerate(tokens):
            if word[6] == s[0]:
                flag = 1
                queue.append((i + 1, word))
        if flag == 1:
            cHeadWords.append(lmt.lemmatize(s[1][1], 'v'))

    queryRel = []
    for word in qHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                queryRel.append(l.name())
            #queryRel.append(l.lemma_names() for l in j.hypernyms())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    queryRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    queryRel.append(k)

    candidateRel = []
    for word in cHeadWords:
        for i, j in enumerate(wn.synsets(word)):
            for l in j.lemmas():
                candidateRel.append(l.name())
            #queryRel.append(l.lemma_names() for l in j.hypernyms())
            for l in j.hypernyms():
                for k in l.lemma_names():
                    candidateRel.append(k)
            for l in j.hyponyms():
                for k in l.lemma_names():
                    candidateRel.append(k)

    exactHeadScore = 0
    count = 0
    for j in cHeadWords:
        count = count + 1
        for i in qHeadWords:
            #print i,j
            if i == j:
                exactHeadScore = exactHeadScore + 1
    try:
        exactHeadScore = exactHeadScore / count
    except:
        exactHeadScore = 0
    #print "Exact Head Score\n"

    relHeadScore = 0
    count = 0
    for j in candidateRel:
        count = count + 1
        if j in queryRel:
            relHeadScore = relHeadScore + 1

    try:
        relHeadScore = relHeadScore / count
    except:
        relHeadScore = 0
    #print "Relative Head Score\n"
    return relHeadScore, exactHeadScore
Example #30
0
                    dest='version',
                    metavar='VERSION',
                    help="Version of CoreNLP (will use default if not set)")
parser.add_argument('-d',
                    '--debug',
                    action='store_true',
                    help="Enable debugging (subprocess only)")

args = parser.parse_args()
if args.debug:
    print('Args:', args)

conversion_args = dict(representation=args.representation,
                       universal=args.universal)
if args.debug:
    if args.backend == 'subprocess':
        conversion_args['debug'] = True
    else:
        print("Warning: Can only set debug flag in subprocess backend.",
              file=sys.stderr)

sd = StanfordDependencies.get_instance(backend=args.backend,
                                       version=args.version)
if not args.filenames:  # interactive mode
    print("Ready to read and convert trees (one per line)")
for tree in fileinput.input(args.filenames):
    print('Tree: %r' % tree)
    tokens = sd.convert_tree(tree, **conversion_args)
    for token in tokens:
        print(token)
Example #31
0
#feature prepare
# f19; fill entail set
stop = stopwords.words('english')
# entailLst = entailfeaturePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/reverb_local_global/Resource0812/reverb_local_clsf_all.txt')
HedgeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/hedges_hyland2005.txt','r','utf-8') if ('#' not in line)])
# FactiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/factives_hooper1975.txt','r','utf-8') if ('#' not in line)])
# AssertiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/assertives_hooper1975.txt','r','utf-8') if ('#' not in line)])
# ImplicativeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/implicatives_karttunen1971.txt','r','utf-8') if ('#' not in line)])
# ReportLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/bias_related_lexicons/report_verbs.txt','r','utf-8') if ('#' not in line)])
# StrongSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff','strongsubj')
# WeakSubjLst = subjectivePrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff','weaksubj')
# PolarityDict = polarityPrepare('/Volumes/Seagate Backup Plus Drive/npov_paper_data/subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')
# PosiveLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/positive-words.txt','r','utf-8') if (';' not in line)])
# NegativeLst = filter(None,[ line.rstrip() for line in codecs.open('/Volumes/Seagate Backup Plus Drive/npov_paper_data/opinion-lexicon-English/negative-words.txt','r') if (';' not in line)])
sd = StanfordDependencies.get_instance(backend="subprocess")
os.environ['STANFORD_PARSER'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
os.environ['STANFORD_MODELS'] = '/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/'
parser = stanford.StanfordParser(model_path="/Users/wxbks/Downloads/stanford-parser-full-2014-08-27/stanford-parser-3.4.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")


npovdict = {}
labels = []
features = []
line_num = 0
# npovlist = []
start = timeit.timeit()
for line in gram5_train:
    line = line.decode('utf8')
    line = line.rstrip('\n')
    nline = line.split('\t')
Example #32
0
 def __init__(self, CACHE):
     Cached.__init__(self, CACHE)
     self.sd = StanfordDependencies.get_instance(jar_filename=STANFORD_JAR,
                                                 backend='jpype')
Example #33
0
st = StanfordNERTagger(
    '../StanfordNLP/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    '../StanfordNLP/stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')

#for deprel
os.environ[
    'STANFORD_PARSER'] = '../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser.jar'
os.environ[
    'STANFORD_MODELS'] = '../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'
parser = stanford.StanfordParser(
    model_path=
    "../StanfordNLP/stanford-parser-full-2018-10-17/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
)
sd = StanfordDependencies.get_instance(
    jar_filename=
    '../StanfordNLP/stanford-parser-full-2018-10-17/stanford-parser.jar')

#for read_data
train_path = './SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT'
test_path = './SemEval2010_task8_all_data/SemEval2010_task8_testing_keys/TEST_FILE_FULL.TXT'
#for write_json
train_json = './ToTacredResult/train.json'
test_json = './ToTacredResult/test.json'


def most_common(words):
    user_counter = Counter(words)
    if len(user_counter.most_common(len(words))) == 1:
        return user_counter.most_common(1), False
    else: