Example #1
def get_pos_tags(content, stopwords, is_stemming, is_math):
  # Content should be tokenized
  pos_tagger_dir = '/usr/users/swli/program/nlp_util/stanford-postagger'
  model = pos_tagger_dir + '/models/wsj-0-18-bidirectional-distsim.tagger'
  classpath = pos_tagger_dir + '/stanford-postagger_with_slf4j.jar'
  tagger = StanfordPOSTagger(model, classpath, java_options='-mx4000m')
  try:
    tag_results = tagger.tag(re.split(r'\s+', content))
  except OSError:
    sentences = re.split(r'\s+\.\s+', content)
    tag_results = []
    for index in range(len(sentences)):
      sentence = sentences[index]
      if index < len(sentences)-1:
        sentence += ' .'
      tag_results += get_contaminated_tag_results(sentence, tagger)
 
  pos_tags = []
  for pair in tag_results:
    word = pair[0]
    # map simple equation to tokens
    if is_math:
      word = simple_eq_to_text(word)
    # remove punctuation
    word = "".join(l for l in word if l not in string.punctuation)
    word = word.lower()
    word = process_word(word, stopwords, is_stemming, is_math)
    if word:
      pos_tags.append(pair[1])
  return pos_tags
Example #2
def genere_liste_natures(l_auteurs, STANFORD_PARSER='../stanford', STANFORD_MODELS='../stanford', JAVAHOME='/import/lhauseux/jre1.8.0_45/bin', bid = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/models/english-bidirectional-distsim.tagger', pt = '/import/lhauseux/Pyzo/pyzo2015a/stanford-postagger-2015-04-20/stanford-postagger.jar'):
    # Set the parameters for the Stanford tagger and for Java:
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS
    os.environ['JAVAHOME'] = JAVAHOME
    
    st = StanfordPOSTagger(bid,path_to_jar=pt,java_options='-mx15000m') 
    nltk.internals.config_java(options='-Xmx2G')

    
    # Create the listes_natures directory if needed
    if not os.path.isdir('./listes_natures'):
        os.mkdir('./listes_natures')
    for auteur in l_auteurs:
        # Create the author's own directory if needed
        if not os.path.isdir('./listes_natures/'+auteur):
            os.mkdir('./listes_natures/'+auteur)
        articles = os.listdir('./auteurs/'+auteur)
        for article in articles:
            if article != 'liens.txt':
                # Read the article as plain text
                f = open('./auteurs/'+auteur+'/'+article,'r')
                contenu = f.read()
                f.close()
                # Convert it into a list of POS tags
                contenu = nltk.word_tokenize(contenu)
                contenu = st.tag(contenu)
                contenu = [c[1] for c in contenu]
                # Save the result
                f = open('./listes_natures/'+auteur+'/'+article,'wb')
                pickle.dump(contenu,f)
                f.close()
                print(auteur,article)
Example #3
def postag_sents(sents):
    if not os.environ.get('STANFORD_MODELS'):
        os.environ["STANFORD_MODELS"] = STANFORD_MODELS
        
    st = StanfordPOSTagger('arabic.tagger', STANFORD_POSTAGGER + '/combined.jar')
    tagged_sents = st.tag_sents(sents)
    tagged_sents = [[tuple(t[1].split('/'))  for t in sent] for sent in tagged_sents]
    
    return tagged_sents
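A minimal usage sketch for postag_sents above, assuming STANFORD_POSTAGGER and STANFORD_MODELS are module-level constants pointing at a local stanford-postagger install (the paths and the sample sentence are placeholders):

# Hypothetical constants; adjust to wherever the tagger was unpacked.
STANFORD_POSTAGGER = '/opt/stanford-postagger-full-2015-12-09'
STANFORD_MODELS = STANFORD_POSTAGGER + '/models'

sents = [['هذا', 'كتاب']]      # already-tokenised Arabic sentences
print(postag_sents(sents))     # e.g. [[('هذا', 'DT'), ('كتاب', 'NN')]] -- actual tags depend on the model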
Example #4
def postagger():
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/stanford-postagger.jar'
	os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

	eng_tagger = StanfordPOSTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27/models/english-bidirectional-distsim.tagger')

	for x in content:
		print(eng_tagger.tag(x.split()))
Example #5
def read_sstb_data(fpath='sstb/sstb_condensed_{}.csv'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')

    dataset_split = ['train', 'test', 'dev']

    for split in dataset_split:
        with open(fpath.format(split), "rb") as f:
            rdr = csv.reader(f)
            tokens_list = []
            labels = []

            # read all the lines
            for row in rdr:
                tokens = clean_str(row[0]).split()
                tokens_list.append(tokens)
                labels.append(row[1])

            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)

            for i in range(len(tokens_list_tagged)):
                tokens_tagged = tokens_list_tagged[i]
                label = labels[i]
                text_tokens = list(zip(*tokens_tagged)[0])
                tag_tokens = list(zip(*tokens_tagged)[1])

                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)

                # get max len
                max_len = max(max_len, len(text_tokens))

                # create an entry for the current rev and add to the list
                curr_rev = {'text_tokens': text_tokens,
                            'tag_tokens': tag_tokens,
                            'label': conv_label_to_label_vec(label),
                            'fold_num': get_fold_num(split)}
                revs.append(curr_rev)

    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)

    return revs, vocab, pos_vocab, max_len
Example #6
def read_mr_data(num_folds, fpath='mr/rt-polarity.{}'):
    revs = []
    vocab = {}
    pos_vocab = {}
    max_len = 0
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    sentiments = ['pos', 'neg']

    for sentiment in sentiments:
        with open(fpath.format(sentiment), "rb") as f:
            tokens_list = []
            label_vec = conv_sent_to_vec(sentiment)

            # read all the lines
            for line in f.read().splitlines():
                tokens = clean_str(line).split()
                tokens_list.append(tokens)

            # pos tagging
            tokens_list_tagged = pos_tagger.tag_sents(tokens_list)

            for tokens_tagged in tokens_list_tagged:
                text_tokens = list(zip(*tokens_tagged)[0])
                tag_tokens = list(zip(*tokens_tagged)[1])

                # add each token to vocab
                for token in text_tokens:
                    if token not in vocab:
                        vocab[token] = len(vocab)
                for tag in tag_tokens:
                    if tag not in pos_vocab:
                        pos_vocab[tag] = len(pos_vocab)

                # get max len
                max_len = max(max_len, len(text_tokens))

                # create an entry for the current rev and add to the list
                curr_rev = {'text_tokens': text_tokens,
                            'tag_tokens': tag_tokens,
                            'label': label_vec,
                            'fold_num': np.random.randint(0, num_folds)}
                revs.append(curr_rev)

    # add padding word
    vocab[PAD_WORD] = len(vocab)
    pos_vocab[PAD_WORD] = len(pos_vocab)

    return revs, vocab, pos_vocab, max_len
Example #7
def pos_tagging(sentence):

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    VP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    '''for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])'''


    return POS_list
Example #8
def get_pos_tag(sen):#pass sentence dataframe
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
Example #9
class Lemmatizer(AbstractStemmer):

    def __init__(self, ):
        super(Lemmatizer, self).__init__()
        self.basename = 'lemmatized'
        self.pos_tagger = StanfordPOSTagger('english-left3words-distsim.tagger', java_options='-mx1024m')
        self.lemmatizer = WordNetLemmatizer()
        self.max_length = 500

    def process(self, words):
        current_sentence = []
        pos_words = []
        for word in words:
            current_sentence.append(word)
            if word in '.!?' and len(current_sentence) > self.max_length:
                try:
                    pos_words += self.pos_tagger.tag(current_sentence)
                except Exception:
                    print 'Broke on', current_sentence
                    raise
                current_sentence = []
        for i in range(len(current_sentence) / self.max_length):
            try:
                pos_words += self.pos_tagger.tag(current_sentence[:self.max_length])
            except Exception:
                print 'Broke on', current_sentence[:self.max_length]
                raise
            current_sentence = current_sentence[self.max_length:]
        try:
            pos_words += self.pos_tagger.tag(current_sentence)
        except Exception:
            print 'Broke on', current_sentence
            raise
        processed_words = [self.lemmatizer.lemmatize(wd, pos=self.get_wn_pos(ps)) for wd, ps in pos_words]
        return processed_words

    # from http://stackoverflow.com/questions/15586721
    def get_wn_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
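A minimal usage sketch for the Lemmatizer above (Python 2, matching its print statements); AbstractStemmer and the tagger model lookup are assumed to come from the surrounding project:

# Hypothetical usage; assumes CLASSPATH / STANFORD_MODELS let NLTK locate the tagger.
lemmatizer = Lemmatizer()
words = 'The cats were sitting on the mats .'.split()
print lemmatizer.process(words)  # e.g. ['The', 'cat', 'be', 'sit', 'on', 'the', 'mat', '.']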
Example #10
 def __init__(self, tagger_path, model_path, output_filename):
     self.st = StanfordPOSTagger(tagger_path, model_path)
     self.output_filename = output_filename
     try:
         os.remove(self.output_filename)
     except OSError:
         pass
Example #11
def pos_person_tagging(sentence):

    #Setting the path and jar files for the POS Tagger

    english_postagger=StanfordPOSTagger('stanford-postagger-2014-08-27/models/english-bidirectional-distsim.tagger','stanford-postagger-2014-08-27/stanford-postagger.jar')

    NP_list=[]

    POS_list=english_postagger.tag(sentence.split())

    for i in range(0, len(POS_list)):
        if POS_list[i][1] in ['NNS','NNP','NNPS']:
            NP_list.append(POS_list[i][0])


    return NP_list
Example #12
def main():

    initialize()
    # create tagger
    model = '../stanford-postagger/models/chinese-distsim.tagger'
    jar = '../stanford-postagger/stanford-postagger.jar'
    zhPOS = StanfordPOSTagger(model, jar)

    # streaming model: process each line in turn
    with io.open(INFILE, 'r', encoding='utf8') as qts, io.open(OUTFILE, 'w', encoding='utf8') as pos:

        for line in qts:
            qtsPOS = zhPOS.tag(line)
            s = " ".join("%s" % tup[1] for tup in qtsPOS) + "\n"
            pos.write(s)

    return()
Example #13
def get_pos_tag(sen):
    os.environ['CLASSPATH']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/stanford-postagger.jar' #set classpath to pos tagger
    os.environ['STANFORD_MODELS']='STANFORDTOOLSDIR/stanford-postagger-full-2015-12-09/models'
    st = StanfordPOSTagger('/home/sadhana/stanford-postagger-full-2015-12-09/models/english-left3words-distsim.tagger',path_to_jar=
                           '/home/sadhana/stanford-postagger-full-2015-12-09/stanford-postagger.jar')#,path_to_models_jar='/home/sadhana/stanford-postagger-full-2015-12-09/models')

    stanford_dir = st._stanford_jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    st._stanford_jar = ':'.join(stanford_jars)
    for i in list(sen.index.get_values()):
        t=st.tag(sen.loc[i,'Arg'].split())
        tags=[]
        for j in range(0,len(t)):
            tags.append(t[j][1])
        #print i
        sen.set_value(i,'POStag',tags)
    return sen
Example #14
class FeatureProcessing(object):
  def __init__(self):
    self.feat_index = {}
    self.implication_words = ["demonstrate", "suggest", "indicate"]
    self.hyp_words = ["possible"]
    self.method_words = ["probe", "detect"]
    self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

  def get_features(self, phrase, filter_feature='0'):
    words = word_tokenize(phrase)
    pos_tags = self.pos_tagger.tag(words)
    features = []
    for word, tag in pos_tags:
      wl = word.lower()
      # Feat 1: POS features
      if filter_feature != '1':
        if tag != ',' and tag != '.':
          features.append(tag)
      # Feat 2: Verb and adverb identity
      if filter_feature != '2':
        if tag == 'RB' or tag.startswith('VB'):
          features.append(wl)
      # Feat 3: Presence of figure references and citations
      if filter_feature != '3':
        if word.startswith("Fig"):
          features.append("figure")
        if re.search("[A-Z][^\s]+ et al.", phrase) is not None:
          features.append("reference")
    # Feat 4: Presence of specific words or phrases
    if filter_feature != '4':
      if re.search("[Dd]ata not shown", phrase) is not None:
        features.append("data_not_shown")
      for word in self.implication_words:
        if word in phrase:
          features.append("implication_word")
      for word in self.hyp_words:
        if word in phrase:
          features.append("hyp_word")
      for word in self.method_words:
        if word in phrase:
          features.append("method_word")
    return features

  def index_data(self, data, filter_feature='0'):
    all_features = [self.get_features(datum, filter_feature) for datum in data]
    for features in all_features:
      for feat in features:
        if feat not in self.feat_index:
          self.feat_index[feat] = len(self.feat_index)

  def featurize(self, phrase, filter_feature='0'):
    indexed_features = [0] * len(self.feat_index)
    features = self.get_features(phrase, filter_feature)
    for feat in features:
      if feat not in self.feat_index:
        continue
      indexed_features[self.feat_index[feat]] += 1
    return indexed_features
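A minimal usage sketch for FeatureProcessing above; the two phrases are made up and the tagger model is assumed to be discoverable through CLASSPATH / STANFORD_MODELS:

# Hypothetical usage: index the corpus once, then turn phrases into count vectors.
fp = FeatureProcessing()
data = ["These results suggest a novel interaction (data not shown).",
        "We probe the complex as shown in Fig. 2."]
fp.index_data(data)                        # build the feature index
vectors = [fp.featurize(d) for d in data]  # bag-of-feature count vectors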
Example #15
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    split_list = ['train', 'test']
    class_to_label = {}

    for split in split_list:
        with open(data_file.format(split), "rb") as f:
            revs_text = []
            ys = []
            for line in f:
                qclass, rev = line.split(':')[0], line.split(':')[1]
                rev = clean_str(rev)
                if qclass not in class_to_label:
                    class_to_label[qclass] = len(class_to_label)
                    y = class_to_label[qclass]
                else:
                    y = class_to_label[qclass]
                revs_text.append(rev.split())
                ys.append(y)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])[1:]
                tag = list(zip(*rev_tagged)[1])
                y = ys[i]
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                datum = {"y": y,
                         "text": ' '.join(text),
                         "tag": ' '.join(tag),
                         "num_words": len(text),
                         "split": 0 if split == 'train' else 1}
                revs.append(datum)

    return revs, vocab, pos_vocab, len(class_to_label)
Example #16
def build_data_cv(data_file):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx6000m')
    splits = ['train', 'test', 'dev']

    for split in splits:
        with open(data_file.format(split), "rb") as f:
            lines = f.read().splitlines()
            revs_text = []
            ratings = []
            for line in lines:
                line_split = line.split('\t\t')
                rating = int(line_split[2]) - 1
                rev = line_split[3]
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                ratings.append(rating)

            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": ratings[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)

    return revs, vocab, pos_vocab
Example #17
class POSTagger:

    def __init__(self, tagger_path, model_path, output_filename):
        self.st = StanfordPOSTagger(tagger_path, model_path)
        self.output_filename = output_filename
        try:
            os.remove(self.output_filename)
        except OSError:
            pass

    def output_knowledge(self, sentence):
        sentence += " ."
        s = ""
        with open(self.output_filename, "a") as file:
            for word, pos_tag in self.st.tag(sentence.split()):
                file.write(("%s\t%s\n" % (word, pos_tag)).encode("utf-8"))
            file.write("\n")
Example #18
# from nltk.tag import StanfordNERTagger
# eng_tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
# print eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split())
#
# # Chinese named-entity recognition
# chi_tagger = StanfordNERTagger('chinese.misc.distsim.crf.ser.gz')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
# for word, tag in chi_tagger.tag(sent.split()):
#     print word.encode('utf-8'), tag
#
# # English POS tagging
from nltk.tag import StanfordPOSTagger
# eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# print eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split())
# # Chinese POS tagging
chi_tagger = StanfordPOSTagger('chinese-distsim.tagger')
# sent = u'北海 已 成为 中国 对外开放 中 升起 的 一 颗 明星'
sent = u'宫体 子宫 呈 垂直位 宫内膜 高 T2 信号 连续'
for _, word_and_tag in chi_tagger.tag(sent.split()):
    word, tag = word_and_tag.split('#')
    print word.encode('utf-8'), tag


# Chinese and English syntactic parsing; the only difference is the model used
from nltk.parse.stanford import StanfordParser
eng_parser = StanfordParser(model_path='edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
sent = list(u'子宫 呈 垂直位 , 宫内膜 高 T2 信号 连续'.split())
for tree in eng_parser.parse(sent):
    tree.pprint()

Example #19
# -*- coding:utf-8 -*-
import os
import json
import re
import nltk
from flask import Flask, render_template, request  # import Flask classes

from nltk.tag import StanfordPOSTagger

app = Flask(__name__)
os.system("clear")

root_path = "/home/e20160010106/projet teR NOUVEAU/StanfordTagguer/"
# StanfordPOSTagger instance with UTF-8 encoding
pos_tagger = StanfordPOSTagger(root_path + "/models/french.tagger",
                               root_path + "/stanford-postagger.jar",
                               encoding='utf8')


def pos_tag(sentence):
    # turn the sentence into tokens => if your text has several sentences, first split it into sentences with nltk
    tokens = nltk.word_tokenize(sentence)
    tags = pos_tagger.tag(tokens)  # run the tagging
    print(tags)
    return tags


@app.route('/')  # route whose path / serves the home page
def recherche_1():
    print('/recherche_1')
    return render_template(
Example #20
# -*- coding:utf-8 -*-

from nltk.tag import StanfordPOSTagger
jar = 'C:/Users/Shijie Xu/Documents/NLTK/stanford-postagger-full-2012-01-06/stanford-postagger-2012-01-06.jar'
model = 'C:/Users/Shijie Xu/Documents/NLTK/stanford-postagger-full-2012-01-06/models/french.tagger'

pos_tagger = StanfordPOSTagger(model, jar, encoding="utf8")
res = pos_tagger.tag(
    'Si toutes ces excuses ne suffisent pas, je veux bien dédier ce livre à l’enfant qu’a été autrefois cette grande personne.'
    .split())
"""tag_abbreviations = {
                    'A': 'adjective',
                    'Adv': 'adverb',
                    'CC': 'coordinating conjunction',
                    'Cl': 'weak clitic pronoun',
                    'CS': 'subordinating conjunction',
                    'D': 'determiner',
                    'ET': 'foreign word',
                    'I': 'interjection',
                    'NC': 'common noun',
                    'NP': 'proper noun',
                    'P': 'preposition',
                    'PREF': 'prefix',
                    'PRO': 'strong pronoun',
                    'V': 'verb',
                    'PONCT': 'punctuation mark',
                    'N': 'noun'}"""
print(res)
Example #21
def Process_ZH(File):
    # Read file
    with open(File, 'r') as File:
        # print 'Opened'
        Input = File.readlines()


    try:
        for line in Input:
            # Checks if identifier is in the line
            if 'segment' in line:
                #Sets counter to be on and starts count at 0
                Annotations = []
                Annotation_Next = False
                Line = []
                Word_Count = 0

                # Switch to UTF-8 to ensure accurate counting
                Line_UTF8_Decode = line.decode('utf-8')
                Line_Split = Line_UTF8_Decode.split()

                for Split in Line_Split:
                    if 'feature' in Split:
                        Annotations.append([Split[17:-1], Word_Count, 0])
                        # print Annotations

                    elif 'state=' in Split:
                        Line_Temp = re.findall('>([^>]*)</', Split)
                        # print 'State_1'
                        if Line_Temp != []:
                            # Ensures that Line_Temp is a string
                            Line_Temp = Line_Temp[0]
                            Word_Count += 1
                            # print 'State_2'

                            # To ensure nested entities are parsed correctly
                            if Annotations[-1][2] != 0:
                                Annotation_Next = True
                                Length = range(len(Annotations))
                                for x in Length[::-1]:
                                    if Annotations[x][2] == 0 and Annotation_Next == True:
                                        Annotations[x][2] = Word_Count
                                        Annotation_Next = False
                            else:
                                Annotations[-1][2] = Word_Count
                            # print Annotations
                        elif Line_Temp == [] and '<segment' in Split[15:]:
                            pass
                        else:
                            Word_Count += 1
                            Line_Temp = Split[15:]
                            # print 'State 3'
                        if Line_Temp != []:
                            Line.append(Line_Temp)

                    elif '</segment>' in Split:
                        Seg_Split = Split.split('</segment>')
                        for x in Seg_Split:
                            if x != '':
                                Word_Count += 1
                                Line.append(x)
                            elif x == '':
                                if Annotations[-1][2] != 0:
                                # print 'Seg 2'
                                    Annotation_Next = True
                                    Length = range(len(Annotations))
                                    for x in Length[::-1]:
                                    # print Annotations[x][2]
                                        if Annotations[x][2] == 0 and Annotation_Next == True:
                                        # print 'Seg 3'
                                            Annotations[x][2] = Word_Count
                                            Annotation_Next = False
                                else:
                                    Annotations[-1][2] = Word_Count
                            # print Annotations

                        # if '<' not in Split[0]:
                        #     Word_Count += 1
                        #     print Split
                        #     Line_Temp = Split[:-10]
                        #     print Line_Temp
                        #     Line.append(Line_Temp)
                        #     # print 'Seg_1'
                        #     if Annotations[-1][2] != 0:
                        #         # print 'Seg 2'
                        #         Annotation_Next = True
                        #         Length = range(len(Annotations))
                        #         for x in Length[::-1]:
                        #             # print Annotations[x][2]
                        #             if Annotations[x][2] == 0 and Annotation_Next == True:
                        #                 # print 'Seg 3'
                        #                 Annotations[x][2] = Word_Count
                        #                 Annotation_Next = False
                        #     else:
                        #         Annotations[-1][2] = Word_Count
                        #     # print Annotations

                    elif '<segment' not in Split:
                        # print Split
                        Line.append(Split)
                        # Checks if Split is a punctuation character
                        if re.findall('[%s]' % zhon.hanzi.punctuation, Split) == [] and Split != ':':
                            Word_Count += 1
                Line_Done = ' '.join(Line)

                # Tags using StanfordPOSTagger

                ST = StanfordPOSTagger('~/Annotations/models/chinese-distsim.tagger', '~/Annotations/stanford-postagger.jar', encoding='utf-8')
                Tags = ST.tag(Line)
                Tags_Done = ''
                for x in Tags:
                    # print x
                    Tags_Done += x[1][-2:] + ' '

                # print Line_Done
                # print Tags_Done

                Annotations_Done = ''
                for x in Annotations:
                    Annotations_Done += str(x[1]) + ',' + str(x[2]) + ',' + str(x[1]) + ',' + str(x[2]) + ' ' + x[0].upper() + '|'
                # print Annotations_Done

                with open('Processed_Annotations.txt', 'a') as P_A:
                        P_A.write(Line_Done.encode('utf-8') + '\n')
                        P_A.write(Tags_Done + '\n')
                        P_A.write(Annotations_Done[:-1] + '\n' + '\n')
    except IndexError:
        pass
Example #22
# Tokenise the 976 questions
import re
import xlwt
import xlrd
import jieba
from nltk.stem import WordNetLemmatizer
from nltk.tag import StanfordPOSTagger

modelfilename = 'C:\\Users\\asus\\AppData\\Roaming\\nltk_data\\stanfordpostagger\\models\\english-bidirectional-distsim.tagger'
pathtojar = 'C:\\Users\\asus\\AppData\\Roaming\\nltk_data\\stanfordpostagger\\stanford-postagger.jar'
eng_tagger = StanfordPOSTagger(model_filename=modelfilename,
                               path_to_jar=pathtojar)
lemmatizer = WordNetLemmatizer()


# Read one column of an Excel sheet into a list
def exceltolist(path, sheet, col):
    col_values = []
    data = xlrd.open_workbook(path)
    table = data.sheet_by_name(sheet)
    col_values = table.col_values(col)
    #print(len(col_values))
    return col_values


ques = exceltolist('C:\\Users\\asus\\Desktop\\测试\\测试问题.xlsx', 'Sheet2', 0)
#print(ques)

# Read the synonym table and build a dictionary.
combine_dict = {}
for line in open(
Example #23
# -*- coding: utf-8 -*-
from nltk.tag import StanfordPOSTagger
import utility as util
import sys
import os
import re
import properties

reload(sys)
sys.setdefaultencoding('utf-8')
tagger = properties.tagger
taggerJARPath = properties.taggerJARPath
chunkFile = properties.chunkTestFile
st = StanfordPOSTagger(tagger,taggerJARPath)


def extractChunkTags():
        result = util.extract_All_Tags(chunkFile)
        chunktags = []
        for each in result:
            if '$' in each:
                continue
            temp1 = each.split("\t")
            temp2 = temp1[2].split()
            if len(temp2) == 1:
                temp2.append('XX')
            temp3 = []
            temp3.append(temp1[0])
            temp3.append(temp1[1])
            temp3.append(temp2[0])
            temp3.append(temp2[1])
Example #24
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 11 13:38:34 2018

@author: Etudiant
"""

######################################################################################

from nltk.tag import StanfordPOSTagger
jar = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/stanford-postagger-3.9.1.jar'
model = 'C:/Users/Etudiant/Documents/Tableau de bord/stanford-postagger-full-2018-02-27/models/french.tagger'
import os
java_path = "C:/Program Files/Java/jdk1.8.0_151/bin/java.exe"
os.environ['JAVAHOME'] = java_path
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
words={}
tab2 = {}
for i in range(5):
   select=[]
   n=pos_tagger.tag(tab[i])
   stops_verb=['NC','N','NPP']
   for x in n:
       if x[1] in stops_verb:
           select.append(x[0])
           #sel = max(set(select), key=select.count)
   #tab2[i]=sel
   words={}
   for word in set(select):
       
       count = 0  
Example #25
from opencc import OpenCC
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import StanfordSegmenter
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
import pickle, re, pymysql, jieba, os
import pandas as pd

chi_tagger = StanfordPOSTagger('./StanfordNLP/models/chinese-distsim.tagger',
							   './StanfordNLP/jars/stanford-ner.jar')
segmenter = StanfordSegmenter(
	java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
	path_to_jar="./StanfordNLP/jars/stanford-segmenter-3.9.2.jar",
	path_to_slf4j="./StanfordNLP/jars/slf4j-api.jar",
	path_to_sihan_corpora_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data",
	path_to_model="./StanfordNLP/stanford-segmenter-2018-10-16/data/pku.gz",
	path_to_dict="./StanfordNLP/stanford-segmenter-2018-10-16/data/dict-chris6.ser.gz"
)

os.environ["JAVA_HOME"] = "/tmp2/b05902109/jdk-12.0.1"  # note: a Java JDK must be installed; point this at your own JDK install path
os.environ["CLASSPATH"] = "./StanfordNLP/stanford-parser-2018-10-17"
os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"

ch_parser = StanfordParser(model_path='./StanfordNLP/models/chinesePCFG.ser.gz')
cc = OpenCC('t2s')  # (optional) convert Traditional Chinese to Simplified Chinese ('t2s')

def Get_Data_From_Mysql(source_name,keyword):
	contents_list=[]
	target_ids =[]
	db = pymysql.connect(host="18.217.252.187",port=3306, user="******",passwd="antimoneylaunderingisgood2",db="AML_News" ,charset='utf8')
	try:
Example #26
from __future__ import print_function

from collections import Counter
import multiprocessing
import sys

stop_words_list = sw.words('english')
punctus = ':-.,?;!\'\"'
startwithupper_r = re.compile(r"[A-Z].*")
containupper_r = re.compile(r".*[A-Z].*")

java_path = r"/home/tmpuser/jre/bin/java"
os.environ['JAVAHOME'] = java_path
os.environ['JAVA_HOME'] = java_path
tagger_pos = StanfordPOSTagger(
    r'/home/tmpuser/stanford-postagger-2017-06-09/' +
    r'models/english-bidirectional-distsim.tagger',
    r'/home/tmpuser/stanford-postagger-2017-06-09/' +
    r'stanford-postagger.jar',
    java_options='-mx90000m')
pos_tag_list = [
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN',
    'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
    'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
    'WP', 'WP$', 'WRB', '#', '$', "''", '(', ')', ',', '.', ':', '``', 'e',
    'di', 'eff', 'anti-infl', 'classifi', '-'
]
pos_tag_dict = {key: value for value, key in enumerate(pos_tag_list)}
pos_tag_len = len(pos_tag_list)

root_dir = 'description/%s/' % sys.argv[1]
feature_folder = 'stylometry_features/%s/' % sys.argv[1]
Example #27
        new_sen.append(stem_word)
    print " ".join(new_sen)


# stanford version
import nltk
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer


snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = StanfordTokenizer()
eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')


text="Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a change.,They got themselves and Dan on a diet.".split(',')


for sen in text:
    token_list=tokenizer.tokenize(sen[:-1])
    tagged_sen=eng_tagger.tag(token_list)
    new_sen=[]
    for (word,tag) in tagged_sen:
        # print word,tag
        if tag[0]=='V':
            
            lemma_word=wordnet_lemmatizer.lemmatize(word,pos='v')
        else:
Example #28
'''
Created on 26 Mar 2018

@author: Owner
'''
import pandas as pd
from nltk.tag import StanfordPOSTagger
from nltk.corpus import stopwords
import nltk
from textblob import TextBlob
from pandas import DataFrame
from pandas import ExcelWriter

st = StanfordPOSTagger(
    '$/Multisimo/stanford-postagger-2018-02-27/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger',
    path_to_jar=
    '$/Multisimo/stanford-postagger-2018-02-27/stanford-postagger-2018-02-27/stanford-postagger.jar'
)
df_proc = pd.read_excel(
    '$/Multisimo/Multisimo_Data/Processed_transcripts.xlsx')
transcripts = df_proc['test_set'].tolist()

words = set(nltk.corpus.words.words())
eng_stopwords = set(stopwords.words('english'))

sentiment_list = []
nouns_list = []  #NN*
verbs_list = []  #VB*
personal_pronoun_list = []  #PRP
possessive_pronoun_list = []  #PRP$
wh_list = []  #W*
Example #29
import nltk
import math
import numpy as np
from nltk.corpus import wordnet
from nltk.tag import StanfordPOSTagger

###########################################ICANTUSEPOSTAGGER
#from nltk.tag.stanford import POSTagger

#############
#Change path#
#############
java_path ="/usr/bin/java.exe"
###########################################ICANTUSEPOSTAGGER
#st=POSTagger('C:\\Users\\CGR\\Desktop\\stanford-postagger-2015-04-20\\stanford-postagger-2015-04-20\\models\\english-bidirectional-distsim.tagger','C:\\Users\\CGR\\Desktop\\stanford-postagger-2015-04-20\\stanford-postagger-2015-04-20\\stanford-postagger.jar',encoding='UTF-8')
st = StanfordPOSTagger('/home/jsrang02/english-bidirectional-distsim.tagger', '/home/jsrang02/stanford-postagger.jar')

os.environ['JAVAHOME'] = java_path
print("Please Input RTE Set Number:")
key=input()
#############
#Change path#
#############
if key=="1":
    tree = parse("/home/jsrang02/RTE/dev/dev.xml")
elif key=="2":
    tree = parse("/home/jsrang02/RTE/dev/dev2.xml")
elif key=="3":
    tree = parse("/home/jsrang02/RTE/dev/dev3.xml")
root = tree.getroot()
true_similarity=[]
Example #30
def number(sentence):
    pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8' )
    tagged_sentence= pos_tagger.tag(sentence.split())
    numbers = [word for word,tag in tagged_sentence if tag == 'DET' and det_or_nb(word)=='nb']
    return(' '.join(numbers))
Example #31
__author__ = 'Anirudh'

import codecs
import nltk
from nltk.tag import StanfordPOSTagger
nltk.internals.config_java("C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe")

import os
java_path = "C:\Program Files\Java\jdk1.8.0_60\\bin\java.exe"
os.environ['JAVAHOME'] = java_path


# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\arabic.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')
#st = StanfordPOSTagger('D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\models\\english-bidirectional-distsim.tagger','D:\Curriculum\Natural-Language-Processing\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09\stanford-postagger.jar')

file="arabic_in.txt"
source = codecs.open(file,"r","utf-16-be")
destination = codecs.open("utf8encoder_out.txt","wb","utf-8")
contents=source.read()
destination.write(contents)

destination = codecs.open("utf8encoder_out.txt","r","utf-8")
contents2=destination.read()

print contents2.split()

print st.tag(contents2.split())
Example #32
import pickle

with open('./all_seqs.pkl', 'rb') as fh:
    train, valid, test = pickle.load(fh)
with open('./nli_tokenizer.pkl', 'rb') as fh:
    tokenizer = pickle.load(fh)
dict = {w: i for (w, i) in tokenizer.word_index.items()}
inv_dict = {i: w for (w, i) in dict.items()}
word_candidate = {}
trains = [t[1:-1] for t in train['s2']]
tests = [t[1:-1] for t in test['s2']]
from nltk.tag import StanfordPOSTagger
jar = 'stanford-postagger-2018-10-16/stanford-postagger.jar'
model = 'stanford-postagger-2018-10-16/models/english-left3words-distsim.tagger'
pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

#from nltk.stem import WordNetLemmatizer
#wnl = WordNetLemmatizer()
#pos_tag=pos_tagger.tag(['what', "'s", 'invigorating', 'about', 'it', 'is', 'that', 'it', 'does', "n't", 'give', 'a', 'damn'])
#print(pos_tag)

train_text = [[inv_dict[t] for t in tt] for tt in trains]

all_pos_tags = []

for text in train_text:

    pos_tags = pos_tagger.tag(text)
    all_pos_tags.append(pos_tags)
f = open('pos_tags.pkl', 'wb')
pickle.dump(all_pos_tags, f)
Example #33
 def __init__(self, ):
     super(Lemmatizer, self).__init__()
     self.basename = 'lemmatized'
     self.pos_tagger = StanfordPOSTagger('english-left3words-distsim.tagger', java_options='-mx1024m')
     self.lemmatizer = WordNetLemmatizer()
     self.max_length = 500
Example #34
import re
from nltk.parse.stanford import StanfordDependencyParser
path_to_jar = r'C:\Users\Lovisa\Downloads\stanford-corenlp-full-2016-10-31\stanford-corenlp-full-2016-10-31\stanford-corenlp-3.7.0.jar'
path_to_models_jar = r'C:\Users\Lovisa\Downloads\stanford-corenlp-full-2016-10-31\stanford-corenlp-full-2016-10-31\stanford-corenlp-3.7.0-models.jar'
dependency_parser = StanfordDependencyParser(
    path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
from nltk.tag import StanfordPOSTagger
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
from nltk import word_tokenize

from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()


def read_ingredients(filename, ing_list, new_ingredients):
    ingredients = {}
    section = None
    with open('baking_recipes/' + filename, 'r') as f:
        for line in f:
            if "INGREDIENTS" in line:
                continue
            if "PREPARATION" in line:
                break
            ##Ignore any blank lines
            line = line.strip()
            if not line:
                continue

            if line.endswith(
                    ":"):  #structural assumption: sections end with colon
                section = line
Example #35
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

MODEL_PATH = '/Users/yunhongmin/stanford-postagger-full/models/english-bidirectional-distsim.tagger'
JAR_PATH = '/Users/yunhongmin/stanford-postagger-full/stanford-postagger-3.9.2.jar'

pos_tagger = StanfordPOSTagger(MODEL_PATH, JAR_PATH)
text = 'If you unpack the tar file, you should have everything needed. This software provides a GUI demo, a command-line interface, and an API. Simple scripts are included to invoke the tagger. For more information on use, see the included README.txt.'

tokens = word_tokenize(text)
print(tokens)
print()
print(pos_tagger.tag(tokens))
Example #36
# -*- coding: utf-8 -*-
from sys import argv
from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import sent_tokenize
import re
import sys
import logging

reload(sys)  
sys.setdefaultencoding('utf8')
script, filename, loadname = argv
logging.basicConfig(format='preprocess progress:%(message)s', level=logging.INFO)
NERTagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
POSTagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
PRPList = ["He", "he", "She", "she", "His", "his", "Her", "him", "her", "him,", "him.", "her,", "her."]
monthElement = "january|february|march|april|may|june|july|august|september|october|november|december"
dateElement = "1|2|3|4|5|6|7|8|9|0"
monthPattern = re.compile(monthElement, re.IGNORECASE)
datePattern = re.compile(dateElement, re.IGNORECASE)

#month: return 1. year for sure: return 2:. correct date: 3. not date 0.
def dateJudge(datePair):
	dateString = datePair[0]
	dateTagger = datePair[1]
	if dateTagger == "CD":
		matchDate = re.findall(datePattern, dateString)
		if len(matchDate) == len(dateString):
			if int(dateString) > 31:
				return 2
			else:
Example #37
""" Handles POS tagging"""

from nltk.tag import StanfordPOSTagger
from lib.utils import POS_TAGGERS_JAR_PATH, POS_TAGGERS_EN_MODEL_PATH, POS_TAGGERS_DE_MODEL_PATH

POS_TAGGER_EN = StanfordPOSTagger(POS_TAGGERS_EN_MODEL_PATH, POS_TAGGERS_JAR_PATH)
POS_TAGGER_DE = StanfordPOSTagger(POS_TAGGERS_DE_MODEL_PATH, POS_TAGGERS_JAR_PATH)

def get_pos_tags(tokenised_sentence, lang):
    """
    Given a tokenised sentence, returns sentence as array of tuples of (token, POS Tag)
    """

    if lang == 'en':
        pos_tagger = POS_TAGGER_EN
    elif lang == 'de':
        pos_tagger = POS_TAGGER_DE
    else:
        raise NotImplementedError('Pass in either en or de as language')

    return pos_tagger.tag(tokenised_sentence)
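A minimal usage sketch for get_pos_tags above; the token list is made up and the model/jar locations are whatever lib.utils points at:

# Hypothetical call with the English model; tokens would normally come from a tokenizer.
tokens = ['What', 'is', 'the', 'airspeed', 'of', 'an', 'unladen', 'swallow', '?']
print(get_pos_tags(tokens, 'en'))  # e.g. [('What', 'WP'), ('is', 'VBZ'), ...]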
Example #38
import frecuencias, filtros, spacy
from frecuencias import *
from filtros import *
from nltk import word_tokenize, sent_tokenize, bigrams, trigrams
from nltk.tag import StanfordPOSTagger
from docx import Document
#cd Documents\Repos\investigacion\source\

# Initial configuration of the Stanford POSTagger (check that the path is correct, otherwise it will not work)
tagger = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\models\spanish.tagger'
jar = r'C:\Users\lau_9\Documents\Repos\investigacion\stanford\stanford-postagger\stanford-postagger.jar'
etiquetador = StanfordPOSTagger(tagger, jar)

# Open the document and extract the words
texto = ""  # List of the text's words, including stopwords
f = open(
    r'C:\Users\lau_9\Documents\Repos\investigacion\source\data\TextoEjemplo.docx',
    'rb')
document = Document(f)
for i in document.paragraphs:
    texto += i.text  #.lower()
f.close()

# Tokenise and reformat the text, removing stopwords
#texto_ref = ' '.join(reformatearTexto(texto))
#palabras_ref = borrarStopwords(word_tokenize(texto_ref))
#print("TEXTO REFORMATEADO:\n" + texto.upper() + "\n")
#print("PALABRAS TOKENIZADAS Y SIN ETIQUETAR:\n" + str(palabras).upper() + "\n")

# Tag the tokens and show their tags
#palabras_etiquetadas = etiquetador.tag(palabras)
Example #39
for path in paths:
    path_length = len(re.sub('/?[\[{].*?[\]}]', '', path).split('/')) - 1
    # if path starts with version info, length should minus 1
    if re.match('^/[v]?[0-9]', path):
        path_length -= 1
    path_lengths.append(path_length)
methods = [method for path in paths for method in swagger['paths'][path]]

# frequency statistic
fd_method = nltk.FreqDist(methods)
fd_path_length = nltk.FreqDist(path_lengths)

# part-of-speech judgement
home_path = '/home/pyx/Desktop/stanford-postagger-full-2015-12-09'
# home_path = '/Users/kenmick/Desktop/stanford-postagger-full-2015-12-09'
st = StanfordPOSTagger(home_path + '/models/english-left3words-distsim.tagger',
        home_path + '/stanford-postagger.jar')

url_noun = []
url_not_noun = []
pos = ['NN', 'NNS', 'IN', 'JJ', 'JJS', 'RB', 'TO', 'PRP', 'PRP$', 'NNP', 'NNPS', 'DT', 'VBG', 'VBN', 'VBD']
count = 1

for path in paths:
    print str(count) + '/' + str(len(paths))
    count += 1
    isNoun = True
    print path
    # remove parameters in path, such as {id}, [id], :id, and split url by level, namely by '/'
    urls = re.sub('/?[\[{].*?[\]}]|/:\w+', '', path).replace('.json', '').lstrip('/').split('/')
    for url in urls:
        for word_pos in st.tag(get_divided_url(url)):
Example #40
with open('/media/ubuntu/Local Disk/MAJOR_PROJECT/REIA/mqueue.txt', 'r') as f:
    first_line = f.readline()

user_input = first_line.split(' ', 1)[1]
max_score = 0.1
map_val = ""
print("\nINPUT = ")
print(user_input)
label = classify(user_input)
suggest_list = []
suggest_message = ""
print("Classified as : " + str(label))
tokens = nltk.word_tokenize(user_input)
print(tokens)
st = StanfordPOSTagger(config['tagger']['model'],
                       path_to_jar=config['tagger']['path'])
stanford_tag = st.tag(user_input.split())
print("Tags")
print(stanford_tag)
with open(MAPPING_PATH, 'r') as data_file:
    data = json.load(data_file)
for i in data[label]:
    dist = jf.jaro_distance(unicode(str(user_input), encoding="utf-8"),
                            unicode(str(i), encoding="utf-8"))
    suggest_list.append(tuple((dist, i)))
    print(dist)
    if (dist > max_score):
        max_score = dist
        map_val = i
if max_score < config['preferences']['similarity_threshold']:
    post_message(
Example #41
def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')

    with open(pos_file, "rb") as f:
        revs_text = []
        for line in f:       
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())

        revs_tagged = pos_tagger.tag_sents(revs_text)

        for rev_tagged in revs_tagged:
            text = list(zip(*rev_tagged)[0])
            tag = list(zip(*rev_tagged)[1])
            words = set(text)
            for word in words:
                vocab[word] += 1
            postags = set(tag)
            for postag in postags:
                pos_vocab[postag] += 1
            datum = {"y": 1,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    with open(neg_file, "rb") as f:
        revs_text = []
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            revs_text.append(orig_rev.split())

        revs_tagged = pos_tagger.tag_sents(revs_text)

        for rev_tagged in revs_tagged:
            text = list(zip(*rev_tagged)[0])
            tag = list(zip(*rev_tagged)[1])
            words = set(text)
            for word in words:
                vocab[word] += 1
            postags = set(tag)
            for postag in postags:
                pos_vocab[postag] += 1
            datum = {"y": 0,
                     "text": ' '.join(text),
                     "tag": ' '.join(tag),
                     "num_words": len(text),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)

    return revs, vocab, pos_vocab
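A minimal usage sketch for build_data_cv above (Python 2, matching the zip subscripting); the file names are placeholders and clean_str is assumed to be defined in the same module:

# Hypothetical call: positive and negative review files, 10 random CV folds.
data_folder = ['rt-polarity.pos', 'rt-polarity.neg']
revs, vocab, pos_vocab = build_data_cv(data_folder, cv=10, clean_string=True)
print len(revs), len(vocab), len(pos_vocab)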
Example #42
from lexnlp.nlp.en.tokens import STOPWORDS, get_lemma_list
from lexnlp.config.stanford import STANFORD_POS_PATH

__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2018, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-lexnlp/blob/master/LICENSE"
__version__ = "0.2.3"
__maintainer__ = "LexPredict, LLC"
__email__ = "*****@*****.**"

# Setup Stanford POS configuration
STANFORD_POS_FILE = os.path.join(STANFORD_POS_PATH, "stanford-postagger.jar")
STANFORD_TOKENIZER = StanfordTokenizer(path_to_jar=STANFORD_POS_FILE)
STANFORD_DEFAULT_TAG_MODEL = os.path.join(
    STANFORD_POS_PATH, "models", "english-bidirectional-distsim.tagger")
STANFORD_TAGGER = StanfordPOSTagger(STANFORD_DEFAULT_TAG_MODEL,
                                    STANFORD_POS_FILE)


def get_tokens_list(text, lowercase=False, stopword=False) -> List:
    """
    Get token list form text using Stanford libraries.
    :param text:
    :param lowercase:
    :param stopword:
    :return:
    """
    if not is_stanford_enabled():
        raise RuntimeError(
            "USE_STANFORD is set to False.  No Stanford functionality available."
        )
Example #43
# Chinese named-entity recognition
ch_tagger = StanfordNERTagger(
    r'/Users/cln/stanford-corenlp/stanford-ner/classifiers/chinese.kbp.distsim.crf.ser.gz',
    path_to_jar=r'/Users/cln/stanford-corenlp/stanford-ner/stanford-ner.jar')

texts = r"欧洲 东部 的 罗马尼亚 首都 是 布加勒斯特 也 是 一 座 世界性 的 城市 北京 南阳 普京 中国 习主席"
ch_rst = ch_tagger.tag(texts.split())

print('汉语命名实体识别:\n', ch_rst, '\n')

from nltk.tag import StanfordPOSTagger

# Chinese POS tagging
chi_tagger = StanfordPOSTagger(
    r'/Users/cln/stanford-corenlp/postagger/models/chinese-distsim.tagger',
    path_to_jar=r'/Users/cln/stanford-corenlp/postagger/stanford-postagger.jar'
)
print("汉语词性标注:")
print(chi_tagger.tag(ch_result.split()))

for _, word_and_tag in chi_tagger.tag(ch_result.split()):
    word, tag = word_and_tag.split('#')
    print(word, tag)

print('\n')

from nltk.parse.stanford import StanfordParser
from nltk import Tree
# Chinese syntactic parsing
chi_parser = StanfordParser(
    r"/Users/cln/stanford-corenlp/parser/stanford-parser.jar",
Example #44
def stopw(document):
    stop = stopwords.words('english')
    res = ' '.join([i for i in document.split() if i not in stop])
    return res


text = """
harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals in canada the northwestern united states and germanyover the years he has issued seven cds in 1995 his home is where the harp is won the muddy award for the best nw blues release from the cascade blues association in portland oregon as well that year it was nominated for a canadian juno for the best bluesgospel recording teamed up with graham guest on piano his cd naturally was voted 1 canadian blues album of 2010 by the blind lemon surveybrown tours extensively with his guitarist j arthur edmonds performing their electric mid1950s chicago blues either as a duo or with the full band while he is home he juggles a few combos working many venues big and small he also leads the harpdog brown band which is a gutsy traditional chicago blues band in 2014 they released what it is comprising mainly original songs and a few classic covers influential blues promoter and broadcaster holger petersen called what it is browns best albumhe was just awarded the maple blues award in toronto for best harmonica player in canada 2014 and was honored with a life time membership to the hamilton blues society
"""

# Add the jar and model via their path (instead of setting environment variables):
jar = 'D:\\Python\\SRC\\stanford-postagger-full-2018-10-16\\stanford-postagger.jar'

model = 'D:\\Python\\SRC\\stanford-postagger-full-2018-10-16\\models\\english-left3words-distsim.tagger'

pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

clean = stopw(text)
text = pos_tagger.tag(word_tokenize(clean))
print(text)

st = StanfordNERTagger(
    'D:\\Python\\SRC\\stanford-ner-2018-10-16\\classifiers\\english.all.3class.distsim.crf.ser.gz',
    'D:\\Python\\SRC\\stanford-ner-2018-10-16\\stanford-ner.jar',
    encoding='utf-8')

initText = clean.title()
tokenized_text = word_tokenize(initText)
tokenized_text
classified_text = st.tag(tokenized_text)
classified_text
Example #45
 def __init__(self):
   self.feat_index = {}
   self.implication_words = ["demonstrate", "suggest", "indicate"]
   self.hyp_words = ["possible"]
   self.method_words = ["probe", "detect"]
   self.pos_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
Example #46
def build_data_cv(data_file, all_phrases, binary, min_len=4):
    revs = []
    vocab = defaultdict(float)
    pos_vocab = defaultdict(float)
    pos_tagger = StanfordPOSTagger(
        'pos-tag/english-left3words-distsim.tagger',
        'pos-tag/stanford-postagger.jar',
        'utf8', False, '-mx2000m')
    splits = ['train', 'test', 'dev']
    sentence_set = set()

    for split in splits:
        with open(data_file.format(split), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            for row in reader:
                rev, sent = row[0], int(row[1])
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev = clean_str_sst(rev)
                if split == 'train':
                    sentence_set.add(rev)
                rev_tokens = rev.split()
                revs_text.append(rev_tokens)
                sent = sentiment_label_for_binary(sent) if binary else sent  # check for binary case
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num(split)}
                revs.append(rev_datum)

    if all_phrases:
        with open(data_file.format("train_phrases"), "rb") as f:
            reader = csv.reader(f)
            revs_text = []
            sents = []
            count = 0
            for row in reader:
                rev, sent = row[0], int(row[1])
                rev = clean_str_sst(rev)
                if rev in sentence_set:
                    count += 1
                    continue
                if binary and sent == 2:  # skip neutral if binary
                    continue
                rev_tokens = rev.split()
                if len(rev_tokens) < min_len:
                    continue
                revs_text.append(rev_tokens)
                sent = sentiment_label_for_binary(sent) if binary else sent  # check for binary case
                sents.append(sent)
            revs_tagged = pos_tagger.tag_sents(revs_text)
            for i in range(len(revs_tagged)):
                rev_tagged = revs_tagged[i]
                text = list(zip(*rev_tagged)[0])
                tag = list(zip(*rev_tagged)[1])
                for word in set(text):
                    vocab[word] += 1
                for postag in set(tag):
                    pos_vocab[postag] += 1
                rev_datum = {"y": sents[i],
                             "text": ' '.join(text),
                             "tag": ' '.join(tag),
                             "num_words": len(text),
                             "split": get_split_num('train')}
                revs.append(rev_datum)

            print "{} sentences in phrases".format(count)

    return revs, vocab, pos_vocab
Example #47
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = '압축을 푼 장소/models/english-bidirectional-distsim.tagger'
STANFORD_POS_JAR_PATH = '압축을 푼 장소/stanford-postagger-3.6.0.jar'

pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

# A made-up example sentence. Replace it with any sentence you like and experiment.
text = 'One day in November 2016, the two authors of this book, Seungyeon and Youngjoo, had a coffee at Red Rock cafe, which is a very popular place in Mountain View.'

tokens = word_tokenize(text)
print(tokens)  # print the split tokens
print()
print(pos_tagger.tag(tokens))  # run POS tagging and print the result

# Extract only the verbs and nouns.
noun_and_verbs = []
for token in pos_tagger.tag(tokens):
    if token[1].startswith('V') or token[1].startswith('N'):
        noun_and_verbs.append(token[0])
print(', '.join(noun_and_verbs))
Example #48
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger

segmenter=StanfordSegmenter(
    # jar that the segmenter depends on
    path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmenter data directory
    path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data",
    # based on the People's Daily corpus released by Peking University for the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz"
    )

segmenter.default_config('zh')
result=segmenter.segment(u'我喜欢学习编程')

chi_tagger = StanfordPOSTagger(
    model_filename=r"/home/jiangix/document/stanford-postagger/models/chinese-distsim.tagger",
    path_to_jar=r"/home/jiangix/document/stanford-postagger/stanford-postagger.jar")

print(chi_tagger.tag(result.split()))