def get_pos_distro_features(train, test=None, within=False):
    if within:
        X_train, X_test, y_train, y_test = split_dataset_within(
            train, int(args.length), False, False)
    else:
        X_train, X_test, y_train, y_test = split_dataset_cross(
            train, test, False, False)
        X_train, y_train = shuffle(X_train, y_train, random_state=42)
        X_test, y_test = shuffle(X_test, y_test, random_state=42)

    X_train_pos = []
    for item in X_train:
        item_pos = nltk.pos_tag(item.split())
        item_mapped = " ".join([
            nltk.map_tag("en-ptb", "universal", tag) for word, tag in item_pos
        ])
        X_train_pos.append(item_mapped)

    X_test_pos = []
    for item in X_test:
        item_pos = nltk.pos_tag(item.split())
        item_mapped = " ".join([
            nltk.map_tag("en-ptb", "universal", tag) for word, tag in item_pos
        ])
        X_test_pos.append(item_mapped)

    X_pos = X_train_pos + X_test_pos
    y_pos = y_train + y_test

    get_pos_distribution_features(X_pos,
                                  y_pos,
                                  "pos_tag_distributions.txt",
                                  model_name=args.model_name,
                                  output_type=args.output_type)
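The Penn-Treebank-to-universal mapping step that both loops above perform can be tried in isolation. A minimal sketch, assuming the standard NLTK resources used later in this listing (averaged_perceptron_tagger, universal_tagset); the printed tags are illustrative:

import nltk

# One-time resource downloads for pos_tag and map_tag
nltk.download("averaged_perceptron_tagger")
nltk.download("universal_tagset")

sentence = "The quick brown fox jumps over the lazy dog"
tagged = nltk.pos_tag(sentence.split())
universal = " ".join(nltk.map_tag("en-ptb", "universal", tag) for word, tag in tagged)
print(universal)  # roughly: DET ADJ ADJ NOUN VERB ADP DET ADJ NOUN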
Code example #2
    def predict_next_word(self, model, base_query_string):
        # Returns the same matches as the argument model, with a probability that
        # takes into account the grammar tags of the words in base_query_string

        # First, get the tag sequence associated to the word sequence
        text = nltk.word_tokenize(base_query_string)
        tagged = nltk.pos_tag(text)

        # Transform it according to the universal tagset
        simplified = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged]
        blank = " "
        simple = blank.join(simplified)

        # Predict the possible tags and their probabilities after this tag sequence
        # according to our Grammar Ngrams model
        tag_matches = self.predict_next_tag(simple)

        if len(tag_matches) == 0:
            return []
        # Keep only the predicted tag (the last one) of each match
        tags = [(match[-1], prob) for match, prob in set(tag_matches)]

        # Predict the possible words and their probabilities after the word sequence
        # according to the Word Ngrams model passed as an argument to this method
        word_matches = model.predict_next_word(base_query_string)

        # Deduplicate once so the same ordering can be reused when rescoring below
        unique_word_matches = list(set(word_matches))

        # Keep only the predicted word (the last one) of each match
        words = [(match[-1], prob) for match, prob in unique_word_matches]

        # Fetch the probability of the tag associated with each predicted word
        prior = []
        for i in range(len(words)):
            word, prob = words[i]

            # Tag the predicted word (pos_tag returns Penn Treebank tags, hence 'en-ptb')
            tag = nltk.pos_tag(nltk.word_tokenize(word))
            universal_tag = nltk.map_tag('en-ptb', 'universal', tag[0][1])

            # Take the probabilities of the equal tag in our GrammarModel predictions or 0
            # if this tag is not possible after our tag sequence
            p = 0
            for j in range(len(tags)):
                if universal_tag == tags[j][0]:
                    p = tags[j][1]
            prior.append(p)

        # Return the same matches as our Word NgramModel before, with a new probability
        better_matches = []
        for i in range(len(prior)):
            match, prob = unique_word_matches[i]
            better_matches.append((match, prob * prior[i]))
        prob_sum = sum(m[-1] for m in better_matches)
        if prob_sum == 0:
            return better_matches
        better_matches = [[m[0], m[-1] / prob_sum] for m in better_matches]
        return better_matches
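The rescoring above multiplies each word probability by the probability of its predicted tag and then renormalizes. A standalone sketch of just that arithmetic, with made-up probabilities:

# Made-up word probabilities, tag priors and word->tag assignments (illustrative only)
word_probs = [("cat", 0.5), ("quickly", 0.3), ("blue", 0.2)]
tag_priors = {"NOUN": 0.6, "ADV": 0.1, "ADJ": 0.3}
word_tags = {"cat": "NOUN", "quickly": "ADV", "blue": "ADJ"}

rescored = [(word, prob * tag_priors[word_tags[word]]) for word, prob in word_probs]
total = sum(prob for _, prob in rescored)
rescored = [(word, prob / total) for word, prob in rescored]
print(rescored)  # roughly: [('cat', 0.77), ('quickly', 0.08), ('blue', 0.15)]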
Code example #3
def transform(doc):
	doc = word_tokenize(doc)
	for i in range(0, len(doc)):
		token = doc[i].rstrip("'")
		if token not in expansions and token not in unexpandables:
			token = token.lstrip("'")
		doc[i] = token
	doc = pos_tag(doc)
	doc = [(map_token(token), map_tag('en-ptb', 'universal', tag)) for token, tag in doc]
	doc_by_process = {}
	for process in processes:
		processed_doc = None
		if process == 'parts_all':
			processed_doc = [tag for token, tag in doc]
		elif process == 'tokens_all':
			processed_doc = [token for token, tag in doc]
		elif process == 'tokens_dense':
			tags = primary_tags - set(['VERB', 'ADV', 'PRON'])
			processed_doc = [wnl.lemmatize(token) for token, tag in doc if tag in tags]
		elif process == 'tokens_other':
			processed_doc = [token for token, tag in doc if tag not in primary_tags]
		else:
			tags = set([process.split('_')[1].upper()])
			processed_doc = [token for token, tag in doc if tag in tags]
		doc_by_process[process] = ' '.join(processed_doc)
	return doc_by_process
Code example #4
def preprocess_text(text, affix, country, pos_tagging=False):
    # make sure you avoid counting URL and NUM as capitalized words (for e.g. German)
    # maybe @username?, maybe check again for RTs?
    with open("fileyouwontneed.txt", "a", encoding="utf-8") as outfile:
        tokenizer = TweetTokenizer(reduce_len=True, preserve_case=True)
        cleanr = re.compile('<.*?>')
        remove_markup = re.sub(cleanr, '', text)
        replace_urls = re.sub(r"http\S+", "URL", remove_markup)
        replace_digits = re.sub(r'\d+', "NUM", replace_urls)
        if affix.endswith("@"):
            replace_digits = re.sub(r'\.([a-zA-Z])', r'. \1', replace_digits)
        text = tokenizer.tokenize(replace_digits)
        if pos_tagging:
            original_text = text
            text = nltk.pos_tag(text)
            text = " ".join([
                nltk.map_tag("en-ptb", "universal", tag) for word, tag in text
            ])
            #text = " ".join([item[1] if item[1].startswith("NN") or item[1].startswith("VB") else item[0] for item in text])

            outfile.write("{0},{1},{2}\n".format(" ".join(original_text),
                                                 text, country))
        else:
            text = " ".join([item for item in text])

        return text
Code example #5
 def _get_meaning_set(self, sentence):
     if sentence is None or sentence == u'':
         return None
     tokens = word_tokenize(sentence)
     processed_tags = nltk.pos_tag(tokens)
     simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                        for word, tag in processed_tags]
     return simplified_tags
Code example #6
 def tag_histogram(text):
     tokenized_text = nltk.word_tokenize(text)
     tagged_text = nltk.pos_tag(tokenized_text)
     simplified_tagged_text = [(word,
                                nltk.map_tag('en-ptb', 'universal', tag))
                               for word, tag in tagged_text]
     tagdict = defaultdict(int)
     for word in simplified_tagged_text:
         tagdict[word[1]] += 1
     return tagdict
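A quick sanity check of tag_histogram, assuming the usual NLTK resources (punkt, the POS tagger, universal_tagset) are installed; the sentence and counts are illustrative:

print(dict(tag_histogram("The cat sat on the mat.")))
# roughly: {'DET': 2, 'NOUN': 2, 'VERB': 1, 'ADP': 1, '.': 1}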
Code example #7
def tag_pos(x):
    sentences = sent_tokenize(x)
    sents = []
    for s in sentences:
        text = word_tokenize(s)
        pos_tagged = pos_tag(text)
        simplified_tags = [(word, map_tag('en-ptb', 'universal', tag))
                           for word, tag in pos_tagged]
        sents.append(simplified_tags)
    return sents
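Hypothetical usage of tag_pos; the exact tags depend on the tagger version:

sents = tag_pos("She reads books. He writes code.")
# roughly: [[('She', 'PRON'), ('reads', 'VERB'), ('books', 'NOUN'), ('.', '.')],
#           [('He', 'PRON'), ('writes', 'VERB'), ('code', 'NOUN'), ('.', '.')]]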
Code example #8
def POSDensitySimple(array):
    tagged = nltk.pos_tag(array)
    simplifiedTags = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                      for word, tag in tagged]
    s = len(array)
    counts = dict(Counter(tag for word, tag in simplifiedTags))
    #counts = collections.UserDict(counts)
    for k in counts.keys():
        counts[k] = '%.4f' % (counts[k] * 1.0 / s)
    return counts
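An illustrative call on an already tokenized sentence (tags, and therefore the densities, may vary with the tagger):

print(POSDensitySimple(["the", "dog", "barks", "loudly"]))
# roughly: {'DET': '0.2500', 'NOUN': '0.2500', 'VERB': '0.2500', 'ADV': '0.2500'}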
Code example #9
def removeUnwantedWords(input):
    userInputWithOnlyQuestionAndKeywords = []
    posTagged = nltk.pos_tag(input)
    simplifiedTags = [(word, nltk.map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]
    for key, value in simplifiedTags:
        if (key.lower() in config.questionList) or (value not in ('ADP', 'PRON', 'DET', 'CONJ', 'PRT') and key != 'is'):
            userInputWithOnlyQuestionAndKeywords.append(key)
        else:
            log.writetofile("blacklisted word: " + key)

    return userInputWithOnlyQuestionAndKeywords
Code example #10
File: Core.py Project: ErfanThinker/PAN19
def extract_words_plus_pos_tags(texts, lang):
    results = []
    if lang in stanford_lang_models:
        import nltk.tag.stanford as stanford_tagger
        tagger = stanford_tagger.StanfordPOSTagger(
            stanford_res_path + stanford_lang_models[lang],
            path_to_jar=stanford_res_path + "stanford-postagger.jar")
        results = tagger.tag(word_tokenize(texts, language=lang_map[lang]))
        if lang == 'en':  # convert eng tags to universal tags
            results = [(word, map_tag('en-ptb', 'universal', tag))
                       for word, tag in results]

    return results
Code example #11
 def train(self, words, tagged=False):
     if tagged is True:
         tags = []
         for i in range(len(words)):
             tags.append(words[i][1])
         self.ngrams = list(nltk.ngrams(tags, self.n))
     else:
         # text = nltk.word_tokenize(words)
         tagged_words = nltk.pos_tag(words)
         universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
         self.ngrams = list(nltk.ngrams(universal_tags, self.n))
     self.frequencies = nltk.FreqDist(self.ngrams)
     self.probs_ng = nltk.MLEProbDist(self.frequencies)
     print(self.probs_ng)
Code example #12
def pos_tagging(tweet):
    """ Finds the pos tag with nltk post_tag function, and then maps them with
    a tag required for the lemmatize function.

    :param tweet: list of words (tokens) represented like strings
    :type tweet: list
    :return: list of tuple (word, tag)
    :rtype: list
    """
    dict_tags = {
        'ADJ': 'a',
        'ADJ_SAT': 's',
        'ADV': 'r',
        'NOUN': 'n',
        'VERB': 'v'
    }

    tokens_tags = []
    for token, ptb_tag in pos_tag(tweet):
        universal = map_tag('en-ptb', 'universal', ptb_tag)
        tokens_tags.append((token, dict_tags.get(universal, '')))

    return tokens_tags
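A hypothetical follow-up showing how the returned (word, tag) pairs feed nltk's WordNetLemmatizer, which is the lemmatize function the docstring refers to; the token list and lemmas are illustrative and assume the wordnet resource is installed:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tweet = ["the", "cats", "were", "running", "fast"]
lemmas = [lemmatizer.lemmatize(word, tag) if tag else lemmatizer.lemmatize(word)
          for word, tag in pos_tagging(tweet)]
print(lemmas)  # roughly: ['the', 'cat', 'be', 'run', 'fast']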
Code example #13
File: POS_tagging.py Project: pegahani/REUS
    def sentence_2_pos(self, sent):

        porter = PorterStemmer()
        #without stemming
        #text = nltk.word_tokenize(sent)

        #with stemming
        text = [porter.stem(word) for word in word_tokenize(sent)]
        posTagged = nltk.pos_tag(text)
        words_tags = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]

        words = [item[0] for item in words_tags]
        tags = [item_[1] for item_ in words_tags]

        pos_onehot = [self.one_hot_POS(i) for i in tags]
        pos = list(np.sum(np.array(pos_onehot), axis=0))

        return (words, pos)
Code example #14
 def lemmatizeTokens(self, tokens):
   tokens_tagged = pos_tag(tokens)
   tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) 
     for word, tag in tokens_tagged]
   
   #Actually lemmatize.
   lemmas = []
   for token, tag in tokens_simpleTags:
     lemmatized = ""
     if tag == "VERB":
       lemmatized = self.lemmatizer.lemmatize(token, pos='v')
     elif tag == "ADJ":
       lemmatized = self.lemmatizer.lemmatize(token, pos='a')
     elif tag == "ADV":
       lemmatized = self.lemmatizer.lemmatize(token, pos='r')
     else:
       lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n'
     lemmas.append(lemmatized)
   return lemmas
Code example #15
	def lemmatizeTokens(self, tokens):
		tokens_tagged = pos_tag(tokens)
		#Get simple POS tags.
		tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) 
			for word, tag in tokens_tagged]
		
		#Actually lemmatize.
		lemmas = []
		for token, tag in tokens_simpleTags:
			lemmatized = ""
			if tag == "VERB":
				lemmatized = self.lemmatizer.lemmatize(token, pos='v')
			elif tag == "ADJ":
				lemmatized = self.lemmatizer.lemmatize(token, pos='a')
			elif tag == "ADV":
				lemmatized = self.lemmatizer.lemmatize(token, pos='r')
			else:
				lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n'
			lemmas.append(lemmatized.encode("utf-8"))
		return lemmas
Code example #16
def pos_tagger(data):
    """
    does pos tagging on the input text file

    Parameters: 

    data: str
        text file

    Returns: list
        pos tagged list of words
    """
    sentences = sent_tokenize(data)
    sents = []
    for s in sentences:
        text = word_tokenize(s)
        pos_tagged = pos_tag(text)
        sub_tags = [(word, map_tag('en-ptb', 'universal', tag))
                    for word, tag in pos_tagged]
        sents.append(sub_tags)
    return sents
Code example #17
 def runSingleWords(self):
     percentage = float(self.getPluginParamValue("Percentage")) / 100.0
     minCharLength = int(self.getPluginParamValue("MinCharLength"))
     posFilter = self.getPluginParamValue("POS")
     inputContent = self.getInputContent().lower()
     punctuation = string.punctuation.replace("-", "")
     puncFilter = dict((ord(char), None) for char in punctuation)
     tokens = nltk.word_tokenize(inputContent.translate(puncFilter))
     tokensCnt = len(tokens)
     if tokensCnt < 1:
         self.raiseException("No words found")
     maxTokensCnt = int(percentage * tokensCnt)
     tags = nltk.pos_tag(tokens)
     pos = [(token, nltk.map_tag('en-ptb', 'universal', tag))
            for token, tag in tags]
     filteredTokens1 = []
     for p in pos:
         if len(p[0]) < minCharLength:
             continue
         if p[1] not in posFilter:
             continue
         filteredTokens1.append(p)
     freqTokens = nltk.FreqDist(tokens)
     content = ""
     cnt = 0
     for freqToken in freqTokens.most_common(tokensCnt):
         for token in filteredTokens1:
             if freqToken[0] == token[0]:
                 content = "{0}\n{1},{2},{3}".format(
                     content, token[0], token[1], freqToken[1])
                 cnt += 1
                 break
         if cnt >= maxTokensCnt:
             break
     content = content.strip()
     self.setAnalyzerContent(content)
     return content
Code example #18
def token_parse_amz(categ, path):
  if categ == 'Yelp' or categ == 'Tripadvisor':
    return
  done = 0
  start = time.time()
  # Load stopwords and tokenizer
  stopwds = stopwords.words('english')
  tokenizer = regexp.RegexpTokenizer(r"[\w']+", flags=re.UNICODE)

  with open(path, 'r') as g:
    for l in g:
      u = json.loads(json.dumps(eval(l)))
      if not (u.get('reviewerID') and u.get('asin') and u.get('reviewerName') \
              and u.get('helpful') and u.get('reviewText')):
        continue
      if u['helpful'][1] < 10:
        continue
      sentences = sent_tokenize(u['reviewText'])
      num_sent = len(sentences)
      num_tokens = 0
      num_pos = 0
      num_neg = 0
      sent_len = 0
      words = []
      for sentence in sentences:
        sent_len += len(sentence)
        tokens = tokenizer.tokenize(sentence)
        num_tokens += len(tokens)

        pos_tagged = pos_tag(tokens)
        simplified_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in pos_tagged]
        for word, tag in simplified_tags:
          words.append({'word': word, 'pos': tag})
          tf = tag[0].lower()
          if tag == 'ADV':
            tf = 'r'
          if tag == 'NP' or tag == 'NUM':
            tf = tag # No need to calculate positive score

          if tf in ['a', 'v', 'r', 'n']:
              try:
                  sen_ls = list(swn.senti_synsets(word, tf))
                  if len(sen_ls) != 0:
                      sen_score = sen_ls[0]
                      pos_score = sen_score.pos_score()
                      neg_score = sen_score.neg_score()
                      # obj_score = sen_score.obj_score()
                      if pos_score > neg_score:
                          num_pos += 1
                      if pos_score < neg_score:
                          num_neg += 1
              except WordNetError:
                  pass

      if num_sent != 0:
        sent_len = sent_len / num_sent

      tag = {}
      tag['num_sent'] = num_sent
      tag['sent_len'] = sent_len      
      tag['num_tokens'] = num_tokens
      tag['num_pos'] = num_pos
      tag['num_neg'] = num_neg
      tag['words'] = words
      tag['review_id'] = u['reviewerID']
      tag['user_id'] = u['reviewerName']
      tag['item_id'] = u['asin']
      tag['votes'] = int(u['helpful'][1])
      tag['helpful'] = int(u['helpful'][0])
      done += 1
      if done % 1000 == 0:
        tmp = time.time() - start
        print(categ, 'Tagging reviews, Done ', done, ' in', tmp)
      yield str(tag)
Code example #19
def apply_syntactic_filters(pos_tagged_tokens, syntactic_filters):
    tags = [(word, map_tag('en-ptb', 'universal', tag))
            for word, tag in pos_tagged_tokens]
    return [word.lower() for (word, tag) in tags if tag in syntactic_filters]
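Illustrative usage, keeping only nouns and adjectives; the exact tags, and hence the surviving words, depend on the tagger:

import nltk

pos_tagged = nltk.pos_tag("Fast cars are expensive toys".split())
keywords = apply_syntactic_filters(pos_tagged, {'NOUN', 'ADJ'})
print(keywords)  # roughly: ['fast', 'cars', 'expensive', 'toys']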
Code example #20
def token_parse_yelp(categ, path):
    if categ != "Yelp" and categ != "Tripadvisor":
        return
    done = 0
    start = time.time()
    # Load stopwords and tokenizer
    stopwds = stopwords.words("english")
    tokenizer = regexp.RegexpTokenizer(r"[\w']+", flags=re.UNICODE)

    with open(path, "r") as g:
        for l in g:
            u = json.loads(json.dumps(eval(l)))
            if not (
                u.get("review_id")
                and u.get("user_id")
                and u.get("item_id")
                and u.get("helpful")
                and u.get("votes")
                and u.get("text")
            ):
                continue
            if int(u["votes"]) < 10:
                continue
            sentences = sent_tokenize(u["text"])
            num_sent = len(sentences)
            num_tokens = 0
            num_pos = 0
            num_neg = 0
            sent_len = 0
            words = []
            for sentence in sentences:
                sent_len += len(sentence)
                tokens = tokenizer.tokenize(sentence)
                num_tokens += len(tokens)

                pos_tagged = pos_tag(tokens)
                simplified_tags = [(word, map_tag("en-ptb", "universal", tag)) for word, tag in pos_tagged]
                for word, tag in simplified_tags:
                    words.append({"word": word, "pos": tag})
                    tf = tag[0].lower()
                    if tag == "ADV":
                        tf = "r"
                    if tag == "NP" or tag == "NUM":
                        tf = tag  # No need to calculate positive score

                    if tf in ["a", "v", "r", "n"]:
                        try:
                            sen_ls = list(swn.senti_synsets(word, tf))
                            if len(sen_ls) != 0:
                                sen_score = sen_ls[0]
                                pos_score = sen_score.pos_score()
                                neg_score = sen_score.neg_score()
                                # obj_score = sen_score.obj_score()
                                if pos_score > neg_score:
                                    num_pos += 1
                                if pos_score < neg_score:
                                    num_neg += 1
                        except WordNetError:
                            pass

            if num_sent != 0:
                sent_len = sent_len / num_sent

            tag = {}
            tag["num_sent"] = num_sent
            tag["sent_len"] = sent_len
            tag["num_tokens"] = num_tokens
            tag["num_pos"] = num_pos
            tag["num_neg"] = num_neg
            tag["words"] = words
            tag["review_id"] = u["review_id"]
            tag["user_id"] = u["user_id"]
            tag["item_id"] = u["item_id"]
            tag["votes"] = int(u["votes"])
            tag["helpful"] = int(u["helpful"])
            done += 1
            if done % 100 == 0:
                tmp = time.time() - start
                print categ, "Tagging reviews, Done ", done, " in", tmp
                # break
            yield str(tag)
Code example #21
File: p2_exercise2.py Project: chrispool/PTA
	def manualTagNltk(self):
		sentence = 'Marley was dead : to begin with . There is no doubt whatever about that .'
		tokens = nltk.word_tokenize(sentence)
		taggedText = [(word, nltk.map_tag('brown', 'universal', tag) ) for word, tag in self.manualTagBrown()]
		return list(taggedText)
Code example #22
def pos_senti(df_copy):  # takes a dataframe with a 'tidy_tweet' column and adds SentiWordNet scores
    li_swn = []
    li_swn_pos = []
    li_swn_neg = []
    missing_words = []
    for i in range(len(df_copy.index)):
        text = df_copy.loc[i]['tidy_tweet']
        tokens = nltk.word_tokenize(text)
        tagged_sent = nltk.pos_tag(tokens)
        store_it = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                    for word, tag in tagged_sent]
        #print("Tagged Parts of Speech:",store_it)

        pos_total = 0
        neg_total = 0
        for word, tag in store_it:
            if (tag == 'NOUN'):
                tag = 'n'
            elif (tag == 'VERB'):
                tag = 'v'
            elif (tag == 'ADJ'):
                tag = 'a'
            elif (tag == 'ADV'):
                tag = 'r'
            else:
                tag = 'nothing'

            if (tag != 'nothing'):
                concat = word + '.' + tag + '.01'
                try:
                    this_word_pos = swn.senti_synset(concat).pos_score()
                    this_word_neg = swn.senti_synset(concat).neg_score()
                    #print(word,tag,':',this_word_pos,this_word_neg)
                except Exception as e:
                    wor = lem.lemmatize(word)
                    concat = wor + '.' + tag + '.01'
                    # Check whether the lemmatized word is accepted by the SWN corpus
                    try:
                        this_word_pos = swn.senti_synset(concat).pos_score()
                        this_word_neg = swn.senti_synset(concat).neg_score()
                    except Exception as e:
                        wor = pstem.stem(word)
                        concat = wor + '.' + tag + '.01'
                        # Check whether the stemmed word is accepted by the SWN corpus
                        try:
                            this_word_pos = swn.senti_synset(
                                concat).pos_score()
                            this_word_neg = swn.senti_synset(
                                concat).neg_score()
                        except:
                            missing_words.append(word)
                            continue
                pos_total += this_word_pos
                neg_total += this_word_neg
        li_swn_pos.append(pos_total)
        li_swn_neg.append(neg_total)

        if (pos_total != 0 or neg_total != 0):
            if (pos_total > neg_total):
                li_swn.append(1)
            else:
                li_swn.append(-1)
        else:
            li_swn.append(0)
    df_copy.insert(4, "pos_score", li_swn_pos, True)
    df_copy.insert(5, "neg_score", li_swn_neg, True)
    df_copy.insert(6, "sent_score", li_swn, True)
    return df_copy
Code example #23
    try:
        stopwords = nltk.corpus.stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopwords = nltk.corpus.stopwords.words('english')
    try:
        linker = WN_Linker(w, stopwords)
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
        linker = WN_Linker(w, stopwords)
    try:
        nltk.word_tokenize('cat')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.map_tag('en-ptb', 'universal', 'NNP')
    except LookupError:
        nltk.download('universal_tagset')

    for dset in ['MSVD', 'MSRVTT']:
        json_fn = f'{dset}_parsed_captions.json'
        with open(json_fn) as f:
            d = json.load(f)
        new_dps = []
        for vidid, dp in d.items():
            atoms_with_synsets = linker.get_synsets_of_rule_parse(
                dp, convert=False)
            # Discard atoms that have a component that hasn't been linked to WN
            new_dp = dict(
                dp, **{
                    'atoms_with_synsets': [
Code example #24
File: CreateGrammar.py Project: helderm/shalk
import nltk
import numpy as np
from nltk.corpus import gutenberg
import pickle


def save_object(obj, filename):
    with open(filename, "wb") as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)


sents = gutenberg.sents("blake-poems.txt")

table = []
for i in range(20):
    table.append([])
for s in sents[1:]:
    # TODO: treat '?', '!' and ',' like '.' and don't count '.' toward the sentence length
    if len(s) > 2 and len(s) < 20:
        tags = nltk.pos_tag(s)
        simpleTags = [(word, nltk.map_tag("en-ptb", "universal", tag)) for word, tag in tags]
        tagsOnly = [t[1] for t in simpleTags]
        # this is to filter out headlines
        if tagsOnly[len(tagsOnly) - 1] == ".":
            wordCount = len(tagsOnly) - tagsOnly.count(".")
            table[wordCount].append(tagsOnly)

save_object(table, "grammar")
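A hypothetical read-back of the pickled table (binary mode matches the "wb" fix in save_object above):

import pickle

with open("grammar", "rb") as f:
    table = pickle.load(f)
# table[n] holds the universal-tag sequences observed for n-word sentences
print(len(table[5]))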
Code example #25
 def convert_to_uni_tag(self, token):
     return '_'.join(
         [token[0], nltk.map_tag('en-ptb', 'universal', token[1])])
Code example #26
File: pos.py Project: zishan74750/chatbot_ner
 def tag(self, tokens, tagset=None):
     tagged_tokens = APTaggerUtils.tagger.tag(tokens)
     if tagset:
         tagged_tokens = [(token, nltk.map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens]
     return tagged_tokens
Code example #27
 def pos_tag_simplified(self, tokenized):
     tagged = self.pos_tag(tokenized)
     simplified = [(word, nltk.map_tag('en-ptb', 'universal', tag))
                   for word, tag in tagged]
     return simplified
Code example #28
def convert_tag_to_universal(tag):
    return nltk.map_tag('en-ptb', 'universal', tag)
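Example mappings, assuming the universal_tagset resource has been downloaded:

print(convert_tag_to_universal('NNP'))  # 'NOUN'
print(convert_tag_to_universal('VBZ'))  # 'VERB'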