Example #1
def pos_titles_from(input_path, output_path = None, options = None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
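This snippet depends on project-specific helpers (get_streams, get_fields, the custom Tokenizer), so it is not runnable on its own. A minimal sketch of just the tagging step, assuming the NLTK 'averaged_perceptron_tagger' data is downloaded and substituting TreebankWordTokenizer for the project's Tokenizer:

from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()  # loads the pretrained averaged perceptron model
tokens = TreebankWordTokenizer().tokenize("A statistical study of paper titles")
for word, tag in tagger.tag(tokens):
    print(word, tag)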
Example #2
    def find_ml(self, td):
        f_tokenizer = TreebankWordTokenizer()
        query_words = f_tokenizer.tokenize(td)
        genres = self.sentiment_analysis(query_words)
        weighted_genres = []
        genre_weights = {}
        for x in genres:
            if x[1] is not None:
                weighted_genres.append(x[0])
                genre_weights[x[0]] = x[1]

        d_score_updates = {}
        for movie in self.movies:
            g = self.genre_dict[movie][0]
            total_genre_score = 0
            if u'Comedy' in g and 'comedy' in weighted_genres:
                total_genre_score += genre_weights['comedy']
            if u'Action' in g and 'action' in weighted_genres:
                total_genre_score += genre_weights['action']
            if u'Crime' in g and 'crime' in weighted_genres:
                total_genre_score += genre_weights['crime']
            if u'Drama' in g and 'drama' in weighted_genres:
                total_genre_score += genre_weights['drama']
            d_score_updates[self.movies.index(movie)] = total_genre_score * .1

        return d_score_updates
Example #3
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
Example #4
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence) 
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
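A runnable sketch of what tokenize_en produces, assuming the 'punkt' sentence model has been downloaded (nltk.download('punkt')):

import nltk
from nltk.tokenize import TreebankWordTokenizer

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = TreebankWordTokenizer()

text = "Good muffins cost $3.88 in New York. Please buy me two of them."
print([word_tokenizer.tokenize(s) for s in sent_tokenizer.tokenize(text)])
# [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
#  ['Please', 'buy', 'me', 'two', 'of', 'them', '.']]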
Example #5
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token)  for token in tagger.tag(tokens)])
            print " ".join([token[1]  for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Example #6
	def getNoun(self, parser, sentence):
		#mysent = sentence.encode('ascii','ignore')
		#sent = mysent.decode()
		penn = TreebankWordTokenizer()
		tags = parser.tag(penn.tokenize(sentence))
		the_tags = []
		nouns = []
		for t in tags:
			if t[1].startswith('NN'):
				nouns.append(t[0])
		return ' '.join(nouns)
Example #7
def genLexicon(data):

	tok = TreebankWordTokenizer()

	texts = []
	for doc in data:
		for sent in doc:
			texts.append(tok.tokenize( sent[1].lower() ))

	dictionary = corpora.Dictionary(texts)

	pickle.dump(dictionary, open("lex/toy.lex", "w"))
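Example #8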
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
Example #9
def section_02_02( datDIR ):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~");

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join( datDIR , "the-great-gatsby.txt" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = textfile, mode = 'r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print( "%5d: %s" % (i,tempLine) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print(   mySentence  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+",mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print(   tokens[-10:] )

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print(   myRegexpTokenizer.tokenize(mySentence)  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print(   myTreebankWordTokenizer.tokenize(mySentence)  )
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Example #10
 def word_tokenizePT(self,  text, tokenizer):
     """ tokenize a portuguese sentence in words
     @input params: sentence - a sentence, a phrase (self)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns word's list or error """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found" 
Example #11
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

        self.word_pattern = re.compile(r"^([\w.]*)(\.)(\w*)$")
        self.proper_noun = re.compile(r"([A-Z]\.){2,}$")

        f = open(get_wpath("transition_words"), "r", encoding="utf8")
        transition_word = f.readline()
        self.words = r"([.,!?;:])\ *" + transition_word
        f.close()

        training_sents = nltk.corpus.treebank_raw.sents()
        tokens = []
        boundaries = set()
        offset = 0
        for sent in training_sents:
            tokens.extend(sent)
            offset += len(sent)
            boundaries.add(offset-1)

        # Create training features
        featuresets = [(self.punct_features(tokens, i), (i in boundaries))
                       for i in range(1, len(tokens)-1)
                       if tokens[i] in '.?!']

        train_set = featuresets
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
Example #12
def get_data():
    glove = get_glove()
    tokenizer = TreebankWordTokenizer().tokenize
    text_field = Field(sequential=True,
                       tokenize=tokenizer,
                       include_lengths=True,
                       lower=True,
                       use_vocab=True)
    label_field = Field(sequential=False,
                        pad_token=None,
                        unk_token=None,
                        is_target=True,
                        use_vocab=True)
    with Timer('snli') as timer:
        print('snli{')
        splits = get_snli(text_field, label_field)
        print('}')

    text_field.build_vocab(*splits, vectors=glove)
    label_field.build_vocab(*splits)
    text_vocab = text_field.vocab
    label_vocab = label_field.vocab

    text_embeds = get_embeds(text_vocab.vectors)
    # snli = [pick_samples(ds, n=100) for ds in splits]  # TODO: comment
    snli = splits

    return (snli, text_field, label_vocab, text_embeds)
Example #13
    def predict_with_parser(cls, options):
        if options.input_format == "standard":
            data_test = cls.DataType.from_file(options.conll_test, False)
        elif options.input_format == "space":
            with smart_open(options.conll_test) as f:
                data_test = [cls.DataType.from_words_and_postags([(word, "X") for word in line.strip().split(" ")])
                             for line in f]
        elif options.input_format == "english":
            from nltk import download, sent_tokenize
            from nltk.tokenize import TreebankWordTokenizer
            download("punkt")
            with smart_open(options.conll_test) as f:
                raw_sents = sent_tokenize(f.read().strip())
                tokenized_sents = TreebankWordTokenizer().tokenize_sents(raw_sents)
                data_test = [cls.DataType.from_words_and_postags([(token, "X") for token in sent])
                             for sent in tokenized_sents]
        elif options.input_format == "tokenlist":
            with smart_open(options.conll_test) as f:
                items = eval(f.read())
            data_test = cls.DataType.from_words_and_postags(items)
        else:
            raise ValueError("invalid format option")

        logger.info('Initializing...')
        parser = cls.load(options.model, options)

        ts = time.time()
        cls.predict_and_output(parser, options, data_test, options.out_file)
        te = time.time()
        logger.info('Finished predicting and writing test. %.2f seconds.', te - ts)
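Example #14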
class TreebankWordTokenizerWrapper:
  """ Seriously I don't know why we need this class - this makes no sense """

  PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

  def __init__(self):
    self.word_tokenizer = TreebankWordTokenizer()

  def tokenize(self, s):
    temp = self.word_tokenizer.tokenize(s)
    if temp:
      it = []
      for t0 in temp:
        t = [t0]
        while True:
          m = self.PAT_NLTK_BUG.search(t[0])
          if m:
            t.insert(0, m.group(1))
            t[1] = m.group(2)
          else:
            break
        it += t
        #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
    else:
      it = temp
    return it
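A quick check of the PAT_NLTK_BUG pattern above: it peels a trailing "," or "'s" off a token that the word tokenizer left attached.

import re

PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

for token in ["Peter's", "however,", "plain"]:
    m = PAT_NLTK_BUG.search(token)
    print(token, "->", [m.group(1), m.group(2)] if m else [token])
# Peter's -> ['Peter', "'s"]
# however, -> ['however', ',']
# plain -> ['plain']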
Example #15
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner,self).__init__(input_dir, output_dir, u"-\n'", punctuation+digits)
        self.t = TreebankWordTokenizer()
    
    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)
    
    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking')+dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace('&nbsp;',' ')]
        return ' '.join(arr)
    
    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
Example #16
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t]/max_value
    return tf
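Example #17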
def compute_similarity(j, query, tf, idf, doc_norm, review_idx_mapping,
                       neighborhood):
    """Calculates similarity score bewteen query and each review. Returns a list of review objects with
    similarity score attached"""
    if query == "":
        new_reviews = []
        for review in j["reviews"]:
            new_review = review
            new_review["sim_score"] = 1
            new_reviews.append(new_review)
        return new_reviews

    tokenizer = TreebankWordTokenizer()
    doc_scores = np.zeros(len(doc_norm))  # Initialize D

    query = query.lower()
    tokenized_query = tokenizer.tokenize(query)
    counter = Counter(tokenized_query)
    counter = {
        token: count
        for (token, count) in counter.items() if token in idf
    }
    query_token_to_idx = {
        token: idx
        for idx, (token, _) in enumerate(counter.items())
    }

    for token, count in counter.items():
        cur_token_idx = query_token_to_idx[token]
        q_tfidf = count * idf[token]  # Construct q

        for doc_id, freq in tf[token]:
            doc_scores[doc_id] += q_tfidf * freq * idf[token]  # Construct D

    for idx in range(len(doc_norm)):
        doc_scores[idx] = doc_scores[idx] / (doc_norm[idx] + 1)

    neighborhood = neighborhood.lower()

    output = [(review_idx_mapping[neighborhood][i], doc_scores[i])
              for i in range(len(doc_scores))]
    new_reviews = []
    for idx, score in output:
        new_review = j["reviews"][idx]
        new_review["sim_score"] = score
        new_reviews.append(new_review)
    return new_reviews
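A toy, self-contained sketch of the scoring loop above, using a hypothetical inverted index tf (token -> (doc_id, term frequency) pairs), idf weights, and per-document norms:

import numpy as np
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer

tf = {"pizza": [(0, 2), (1, 1)], "good": [(0, 1)]}      # hypothetical index
idf = {"pizza": 1.5, "good": 1.1}
doc_norm = np.array([2.0, 1.0])

counter = Counter(TreebankWordTokenizer().tokenize("good pizza"))
doc_scores = np.zeros(len(doc_norm))
for token, count in counter.items():
    if token not in idf:
        continue
    q_tfidf = count * idf[token]                         # query-side weight
    for doc_id, freq in tf[token]:
        doc_scores[doc_id] += q_tfidf * freq * idf[token]
doc_scores /= (doc_norm + 1)                             # normalize as above
print(doc_scores)  # higher score = review more similar to the query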
Example #18
 def filtrer1(ennonce):
     from nltk.tokenize import TreebankWordTokenizer
     from nltk.corpus import stopwords
     # Instantiate our tokenizer
     tokenizer = TreebankWordTokenizer()
     tokens = tokenizer.tokenize(ennonce)
     # load the French stopwords
     french_stopwords = set(stopwords.words('french'))
     # a small filter
     tokens = [
         token for token in tokens
         if token.lower() not in french_stopwords
     ]
     filtrat = []
     for element in tokens:
         filtrat.append(element)
     return (filtrat)
Example #19
def q04_count_vectors(path,ranges=(1,2),max_df=0.5,min_df=2):
    data,X_train,X_test,y_train,y_test=q01_load_data(path)
    tokenizer1=TreebankWordTokenizer()
    tf=CountVectorizer(decode_error='ignore',tokenizer=tokenizer1.tokenize,ngram_range=ranges,max_df=max_df, min_df=min_df,stop_words='english')
    tf.fit(X_train)
    variable1=tf.transform(X_train)
    variable2=tf.transform(X_test)
    return variable1,variable2
Example #20
	def pennTreeBank(self, text):		
		tokenizedText = []

		for s in text:
			s=s.lower()
			tokenizedText.append(TreebankWordTokenizer().tokenize(s))

		return tokenizedText
Example #21
 def __init__(self, filename):
     self.filename = filename
     self.tokenizer = TreebankWordTokenizer()
     self.sent_tokenizer = load(
         'tokenizers/punkt/{0}.pickle'.format('english'))
     self.st = StanfordPOSTagger(
         '../stanfordPOStagger/english-bidirectional-distsim.tagger',
         '../stanfordPOStagger/stanford-postagger.jar',
         java_options='-mx2048m')
     #self.w2v_model = KeyedVectors.load_word2vec_format(
     #    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
     #    binary=True)
     self.w2v_model = None
     self.text = self.get_text()
     self.anns = []
     self.idx_list = IdxList()
     self.punct = punctuation + '‘’— \t\n'
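Example #22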
def get_if(example):
    q_toks = example.q2_toks # lower after tokenising as case info is useful
    for marker in CONDITIONAL_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
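Example #23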
def sep_cue(example):
    q_toks = example.q2_toks # lower after tokenising as case info is useful
    for marker in SEPARABLE_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
Example #24
 def __init__(
         self,
         word_tokenizer=TreebankWordTokenizer(),
         sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/turkish.pickle'),
         **kwargs):
     self._seq = MongoDBLazySequence(**kwargs)
     self._word_tokenize = word_tokenizer.tokenize
     self._sent_tokenize = sent_tokenizer.tokenize
Example #25
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])

            except KeyError:
                pass  # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data
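The comment above refers to the Google News word2vec vocabulary, so word_vectors is presumably a pretrained embedding lookup. A hypothetical way to load one with gensim, assuming the .bin.gz file is available locally:

from gensim.models import KeyedVectors

# limit= keeps memory down; drop it to load the full vocabulary
word_vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)
print(word_vectors['happy'].shape)  # (300,)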
Example #26
    def _compute_unigram_frequency(self):
        wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
        tokenizer = TreebankWordTokenizer()
        total = len(wordlists.fileids())
        count = 0
        fdist = nltk.FreqDist()
        for fl in wordlists.fileids():
            count += 1
            fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
            with open(fl_abs_path, 'r') as f:
                words = tokenizer.tokenize(f.read())
                fdist.update(words)
            print 'freqdist: %s of %s' % (count, total)

        with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
            f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
        return None
Example #27
def lemma_tokenizer(text):
    stop_words = stopwords.words("english")
    tokens = TreebankWordTokenizer().tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    filtered_words = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in filtered_words]
    return lemmas
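Example #28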
def tokeniseForDistance(sentence):
    """ Function to return tokens from a sentence       
    """
    punc = list(string.punctuation)
    tokens = TreebankWordTokenizer().tokenize(sentence)
    #tokens = [token for token in tokens if token not in punc]

    return tokens
Example #29
    def tokenize(self, list_text, tokenizer=None):
        if not list_text:
            return None
        if not isinstance(list_text, list):
            raise ValueError("Please input a list of string for tokenization!")
        self.list_text = list_text
        if not tokenizer:
            self.raw_tokens = [text.split() for text in list_text]
        elif "treebank" in tokenizer.lower():
            t = TreebankWordTokenizer()
            self.raw_tokens = [t.tokenize(text) for text in list_text]
        elif "toktok" in tokenizer.lower():
            t = ToktokTokenizer()
            self.raw_tokens = [t.tokenize(text) for text in list_text]

        if not self.raw_tokens:
            return None
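Example #30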
def polar_q1(example):
    q_toks = example.q1_toks # lower after tokenising as case info is useful

    for marker in POLAR_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
Example #31
def ArTokenizer(text, token_min_len=2, token_max_len=15, lower=False):
    tokens = TreebankWordTokenizer().tokenize(
        accents.sub('', puncs.sub(' ', text)))
    # keep only Ar words between min/max len and remove other characters if any
    return [
        nonArabic.sub('', token) for token in tokens if arabic.findall(token)
        and token_min_len <= len(token) <= token_max_len
    ]
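Example #32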
def vp_ell_q2(example):
    q_toks = example.q2_toks # lower after tokenising as case info is useful
    for marker in VERB_ELLIPSIS_MARKERS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
Example #33
 def run(self):
     for i in range(int(self.lo), int(self.hi)):
         data = urlopen(str(url[i]))
         mybytes = data.read().decode('windows-1252').lower()
         tokenizer = TreebankWordTokenizer()
         line = re.sub(
             '[i?.,\',;:/\"<>\\%@#+-_&^$=()…—“”’*»’.``!¿\'`"’ï–]', '',
             mybytes)
         arrayWord = tokenizer.tokenize(line)
         for j in range(len(arrayWord)):
             self.binary.put(arrayWord[j], 1, i)
             w = self.hashTable.find(arrayWord[j])
             if (w != None):
                 self.hashTable.insert(arrayWord[j], w + 1, i)
             else:
                 self.hashTable.insert(arrayWord[j], 1, i)
                 self.sequence.append(Data(i + 1, arrayWord[j], j))
Example #34
 def __init__(self):
     self.tokenizer = TreebankWordTokenizer()
     # remove % and @ from the 4th list as compared to the original PUNCTUATION:
     self.tokenizer.PUNCTUATION = [
         (re.compile(r'([:,])([^\d])'), r' \1 \2'),
         # ABN: added to handle non-pronunceable dashes, like Súes-skurðinn'
         # keep dashes after digits and ordinals, and SNAV (directions). Add 'a-ö'?
         (re.compile(r'([^\.\d[A-ZÞÆÖÁÉÍÓÚÝÐ])([-])'), r'\1 '),
         (re.compile(r'([:,])$'), r' \1 '),
         (re.compile(r'\.\.\.'), r' ... '),
         (re.compile(r'[;#$&]'), r' \g<0> '),
         # Handles the final period.
         # #ABN: changed this to deal with ordinals at the end of sentence: [^\.] -> [^\.\d], don't detach '.' after a digit. (Might be too general)
         (re.compile(r'([^\.\d])(\.)([\]\)}>"\']*)\s*$'), r'\1 \2\3 '),
         (re.compile(r'[?!]'), r' \g<0> '),
         (re.compile(r"([^'])' "), r"\1 ' "),
     ]
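Example #35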
def wh_q2(example):
    q_toks = example.q2_toks # lower after tokenising as case info is useful
    for marker in WH_WORDS:
        marker_toks = TreebankWordTokenizer().tokenize(marker)
        
        if find_sublist(marker_toks, q_toks) > 0:
            return True
    return False
Example #36
def filtrer_ennonce(ennonce):
    from nltk.tokenize import TreebankWordTokenizer
    from nltk.corpus import stopwords
    # Instantiate our tokenizer
    tokenizer = TreebankWordTokenizer()

    tokens = tokenizer.tokenize(ennonce)

    # load the French stopwords
    french_stopwords = set(stopwords.words('french'))

    # a small filter
    tokens = [
        token for token in tokens if token.lower() not in french_stopwords
    ]

    print(tokens)
Example #37
    def __prepare__(self):
        """
        
        """
        conversations = open(path.join(self.BASE_PATH, self.CONVS_FILE),
                             'r').readlines()
        movie_lines = open(path.join(self.BASE_PATH, self.LINES_FILE),
                           'r').readlines()
        tbt = TreebankWordTokenizer().tokenize
        self.words_set = set()
        self.lines_dict = {}
        for i, line in enumerate(movie_lines):
            parts = map(lambda x: x.strip(), line.lower().split(self.FILE_SEP))
            tokens = tbt(parts[-1])
            self.lines_dict[parts[0]] = tokens
            self.words_set |= set(tokens)
        self.word2idx = {}
        self.word2idx[self.PAD_TOKEN] = 0
        self.word2idx[self.EOS_TOKEN] = 1
        self.word2idx[self.GO_TOKEN] = 2
        for i, word in enumerate(self.words_set):
            self.word2idx[word] = i + 3
        self.idx2word = [0] * len(self.word2idx)
        for w, i in self.word2idx.items():
            self.idx2word[i] = w

        # extract pairs of lines in a conversation (s0, s1, s2) -> {(s0, s1), (s1, s2)}
        utt_pairs = []
        for line in conversations:
            parts = map(
                lambda x: x[1:-1],
                map(lambda x: x.strip(),
                    line.lower().split(self.FILE_SEP))[-1][1:-1].split(', '))
            utt_pairs += list(pairwise(parts))
        utt_pairs = np.random.permutation(utt_pairs)
        train_utt_pairs = utt_pairs[self.VAL_COUNT:]
        self.val_pairs = utt_pairs[:self.VAL_COUNT]

        def find_bucket(enc_size, dec_size, buckets):
            return next(
                dropwhile(lambda x: enc_size > x[0] or dec_size > x[1],
                          buckets), None)

        for pair in train_utt_pairs:
            bckt = find_bucket(len(self.lines_dict[pair[0]]),
                               len(self.lines_dict[pair[1]]),
                               self.bucket_sizes)
            if bckt is None:
                self.bucket_pairs[(-1, -1)].append(pair)
            else:
                self.bucket_pairs[bckt].append(pair)

        self.bucket_ordering = []
        for bckt, _ in sorted(map(lambda x: (x[0], len(x[1])),
                                  self.bucket_pairs.items()),
                              key=lambda x: x[1],
                              reverse=True):
            self.bucket_ordering.append(bckt)
Example #38
 def _compute_biagram_frequency(self):
     if not os.path.exists(self.bigram_frequency_dir):
         os.mkdir(self.bigram_frequency_dir)
     wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
     tokenizer = TreebankWordTokenizer()
     total = len(wordlists.fileids())
     count = 0
     for fl in wordlists.fileids():
         count += 1
         print 'freqdist: %s of %s' % (count, total)
         fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
         with open(fl_abs_path, 'r') as f:
             words = tokenizer.tokenize(f.read())
             bi_words = nltk.bigrams(words)
             fdist = nltk.FreqDist(bi_words)
         with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
             f.writelines(['%s %s %s\n' % (word[0], word[1], freq) for (word, freq) in fdist.items()])
     return None
Example #39
    def testTreebankTokenizer(self):
        tokenizer = IndexedTokenizer(TreebankWordTokenizer())
        string = " Facing the long wall in front of you, your destination will be the first door to your left (36-880)."
        tokens = tokenizer.tokenize(string)
        self.assertEqual([t.text for t in tokens],
                         ['Facing', 'the', 'long', 'wall', 'in', 'front', 'of', 'you', ',', 'your', 'destination', 'will', 'be', 'the', 'first', 'door', 'to', 'your', 'left', '(', '36-880', ')', '.'])

        for i, token in enumerate(tokens):
            self.assertEqual(string[tokens[i].start:tokens[i].end], token.text)
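Example #40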
def eiminar_stopwords(words):
    a = open('english.txt')
    result = []
    english_stops = []
    for f in a:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        english_stops += temp
    resultado = []
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    for w in words:
        if not w in english_stops:
            resultado.append(stemmer.stem(w))
    return resultado
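Example #41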
def read_data(filename):

    with open(filename, encoding='utf-8') as f:
        data = tf.compat.as_str(f.read())
        data = data.lower()
        data = text_parse(data)
        data = TreebankWordTokenizer().tokenize(data)  # The Penn Treebank

    return data
Example #42
def stopWords (chaine) :
	tokenizer = TreebankWordTokenizer()
	tokens = tokenizer.tokenize(chaine)
	# load the French stopwords
	french_stopwords = set(stopwords.words('french'))
	# a small filter
	tokens = [token for token in tokens if token.lower() not in french_stopwords]
	counts = Counter(tokens)
	counts=counts.most_common(50)
	dico={}
	tabDico=[]
	for i in range(0,len(counts)):
		dico['text'] = counts[i][0]
		dico['size'] = counts[i][1]
		dico['href'] = "onclick/"+counts[i][0]
		tabDico.append(dico)
		dico={}
	return tabDico
Example #43
class TreeBankWordTokenizerWrapper(AbstractTokenizer):
    def __init__(self, do_lower_case: bool = False):
        self._tokenizer = TreebankWordTokenizer()
        self._do_lower_case = do_lower_case

    def tokenize_single(self, sentence: str):
        if self._do_lower_case:
            sentence = sentence.lower()
        return self._tokenizer.tokenize(sentence)
Example #44
def sentiment_predict(new_sentence):
    new_sentence = TreebankWordTokenizer().tokenize(new_sentence)  # tokenize
    #new_sentence = [word for word in new_sentence if not word in stopwords] # remove stopwords
    encoded = Tokenizer.texts_to_sequences([new_sentence])  # integer encoding
    print(encoded)
    pad_new = pad_sequences(encoded, maxlen=42)  # padding
    #print(pad_new)
    score = float(loaded_model.predict(pad_new))  # predict
    return score
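Note that texts_to_sequences is an instance method, so the call above presumably needs a Keras Tokenizer that has already been fitted on the training texts rather than the Tokenizer class itself. A minimal sketch of that pattern, assuming TensorFlow/Keras is installed:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

corpus = ["the movie was great", "the movie was terrible"]
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(corpus)            # build the word index first
encoded = keras_tokenizer.texts_to_sequences(["the movie was great"])
print(pad_sequences(encoded, maxlen=42).shape)  # (1, 42), as in the snippet above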
Example #45
    def __init__(self, data_path):
        train_path = os.path.join(data_path, "train.txt")
        valid_path = os.path.join(data_path, "valid.txt")
        test_path = os.path.join(data_path, "test.txt")
        vocab_path = os.path.join(data_path, "vocab.pkl")

        self.tokenizer = TreebankWordTokenizer()

        if os.path.exists(vocab_path):
            self._load(vocab_path, train_path, valid_path, test_path)
        else:
            self._build_vocab(train_path, vocab_path)
            self.train_data = self._file_to_data(train_path)
            self.valid_data = self._file_to_data(valid_path)
            self.test_data = self._file_to_data(test_path)

        self.idx2word = {v: k for k, v in self.vocab.items()}
        self.vocab_size = len(self.vocab)
Example #46
def text_fdist(text, min_occurence):
    from nltk.probability import FreqDist
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()

    #tokenise words:
    tokens = tokenizer.tokenize(text)
    #remove stopwords
    tokens = [
        token.lower() for token in tokens if token.lower() not in stopwords_en
    ]

    fdist_in = FreqDist(tokens)

    #filter words with more than one occurence
    fdist = list(filter(lambda x: x[1] >= min_occurence, fdist_in.items()))
    return fdist
Example #47
def tokenize_for_lda(article,
                     tokenizer=TreebankWordTokenizer(),
                     stopwords=stopwords,
                     regex_pattern=nonword):
    article_tokens = [
        tok for tok in tokenizer.tokenize(article)
        if (tok.lower() not in stopwords and not regex_pattern.search(tok))
    ]
    return article_tokens
Example #48
class DssgUnigramExtractor(object):

    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usages:
    unigramExtractor = DssgUnigramExtractor()
    tokenList = unigramExtractor.extract("here is a text as a string") # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)

            newTokens = []
            for tok in tokens:
                # - lowercase, remove '
                tok = tok.lower().strip("`'.,-_*/:;\\!@#$%^&*()=\"")

                # - remove stopwords, one character word, only numbers
                # - remove one character word
                # - remove only numbers
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue

                # - apply stemming
                # oldTok = copy.deepcopy(tok); # for debug
                tok = self._stemmer.stem(tok)
                # sometimes a token is like 'theres' and becomes stopword after
                # stemming
                if tok in self._stopwordSet:
                    continue

                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
Example #49
def tokenize(text, stopword=False, punct=False, lower=False,
             stem=False, num=False, single=False, link=False):
    """
    num: True, exclude numbers
    single: True, exclude single char
    todo: deal with unicode mafuckers
    """
    token = []
    tokenizer = TreebankWordTokenizer()
    token_temp = tokenizer.tokenize(text)
    for elt in token_temp:
        #temp = i.decode('unicode-escape')
        #temp = re.sub(ur'[\xc2-\xf4][\x80-\xbf]+',
        #             lambda m: m.group(0).encode('latin1').decode('utf8'), temp)
        temp = unicode(elt)
        temp = unicodedata.normalize('NFKD', temp).encode('ascii', 'ignore')

        # get rid of empty strings
        #temp = i
        if temp:
            token.append(temp)

    token = [clean_front_end(word) for word in token if clean_front_end(word)]

    if lower:
        token = [word.lower() for word in token]
    if stem:
        token = [stemmer.stem(word) for word in token]
    if num:
        token = [word for word in token if not is_number(word)]
    if single:
        token = [word for word in token if len(word) > 1]
    if stopword:
        token = [word for word in token if word not in STOPWORD]
    if punct:
        token = [word for word in token if word not in PUNCT]
    if link:
        token = [word for word in token if not is_link(word)]

    #exclude empty strings
    token = [word for word in token if word]

    return token
Example #50
def stopwords(filename):
    """A function that returns a dictionary with tokens as keys
    and counts of how many times each token appeared as values in
    the file with the given filename.

    Inputs:
        filename - the name of a plaintext file with a document on each line
    Outputs:
        A list of stopwords and a dictionary mapping tokens to counts.
    """
    
    # We now track the number of times a word shows up (term frequency) and
    # the number of documents with a given word in it (document frequency)
    # separately. We use a Counter, which is exactly like a dictionary except
    # - the values can only be ints
    # - any key it hasn't seen yet is assumed to already have a value of 0
    # This means we don't have to check whether we've used a key before when
    # we use the "+= 1" operation.
    term_frequency_dict = Counter()
    word_total = 0
    
    tokenizer = TreebankWordTokenizer()

    with open(filename, 'r') as f:
        for line in f:
            words = tokenizer.tokenize(line.lower())       

            # For the programmer types: there are several more efficient
            # ways to write this section using dictionaries or sets. You're
            # welcome to rewrite this part to exercise that.      
            for word in words:
                term_frequency_dict[word] += 1
                word_total += 1

    # A fun feature of Counters is that they have a built-in function that
    # gives you the n keys with the biggest values, or the "most common"
    # things being counted. We can use this to find the most common words.
    # This comes out as a list of pairs of key and value, like
    # [('foo', 10), ('bar', 7), ... , ('rare', 1)]
    stoplist_pairs = term_frequency_dict.most_common(100)
    stoplist = [word for (word, freq) in stoplist_pairs]
    
    return stoplist, term_frequency_dict, word_total
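A tiny illustration of the Counter behaviour described in the comments: unseen keys start at 0, so "+= 1" always works, and most_common(n) returns the n largest counts.

from collections import Counter

term_frequency_dict = Counter()
for word in ["the", "cat", "sat", "on", "the", "mat"]:
    term_frequency_dict[word] += 1          # no need to check for missing keys

print(term_frequency_dict.most_common(2))  # [('the', 2), ('cat', 1)]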
Example #51
    def sentences(self, lowercase=False, strip_punct=[], num_placeholder=None):
        word_tokenizer=TreebankWordTokenizer()
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/english.pickle')
        token_sents = [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(self.response)]

        if lowercase:
            token_sents = [[token.lower() for token in sent] for sent in token_sents]

        if len(strip_punct) > 0:
            token_sents = [[token for token in sent if token not in strip_punct] for sent in token_sents]

        if num_placeholder is not None:
            def replace_num(token, placeholder):
                try:
                    float(token.replace(',',''))
                    return placeholder
                except ValueError:
                    return token
                
            token_sents = [[replace_num(token, num_placeholder) for token in sent] for sent in token_sents]
        return token_sents
Example #52
    def test_treebank_span_tokenizer(self):
        """
        Test TreebankWordTokenizer.span_tokenize function
        """

        tokenizer = TreebankWordTokenizer()

        # Test case in the docstring
        test1 = "Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks)."
        expected = [
            (0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)
        ]
        result = tokenizer.span_tokenize(test1)
        self.assertEqual(result, expected)

        # Test case with double quotation
        test2 = "The DUP is similar to the \"religious right\" in the United States and takes a hardline stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
            (65, 68), (69, 74), (75, 76), (77, 85), (86, 92), (93, 95), (96, 102),
            (103, 109)
        ]
        result = tokenizer.span_tokenize(test2)
        self.assertEqual(result, expected)

        # Test case with double qoutation as well as converted quotations
        test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues"
        expected = [
            (0, 3), (4, 7), (8, 10), (11, 18), (19, 21), (22, 25), (26, 27),
            (27, 36), (37, 42), (42, 43), (44, 46), (47, 50), (51, 57), (58, 64),
            (65, 68), (69, 74), (75, 76), (77, 79), (79, 87), (87, 89), (90, 96),
            (97, 99), (100, 106), (107, 113)
        ]
        result = tokenizer.span_tokenize(test3)
        self.assertEqual(result, expected)
Example #53
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
Example #54
def make_word_set(context):
    """ Computes the set of all words used in a list of strings.

    Arguments
    =========

    context: a list of strings

    Returns
    =======

    word_set: set of distinct words
    """
    tokenizer = TreebankWordTokenizer()
    sw = stopwords.words('english')
    word_list = []
    for string in context:
        tkns = tokenizer.tokenize(string)
        for tk in tkns:
            if tk not in sw:
                word_list.append(tk)
    word_set = set(word_list)
    return word_set
Example #55
class Tokenizer(object):
    
    def __init__(self, language='english'):
        self.paragraph_tokenizer = nltk.data.load('tokenizers/punkt/%s.pickle' % language)
        self.sentence_tokenizer = TreebankWordTokenizer()
        self.english_stops = set(stopwords.words(language))
        
    def tokenize(self, text, remove_stopwords=False):
        sentences = self.paragraph_tokenizer.tokenize(text)
        token = []
        for sentence in sentences:
            words = self.sentence_tokenizer.tokenize(sentence)
            if remove_stopwords:
                token.append([word for word in words if word not in self.english_stops])
            else:
                token.append(words)
        return token
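Example #56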
 def __init__(self, images_path, annotations_path, buckets, bucket_minibatch_sizes, word2idx, mean_im, shuffle=True):
     self.buckets = buckets
     self.word2idx = word2idx
     self.bucket_minibatch_sizes = bucket_minibatch_sizes
     self.buffer_size = 16
     self.input_qsize = 64
     self.min_input_qsize = 16
     self.total_max = 0
     self.mean_im = mean_im
     self.tokenizer = TreebankWordTokenizer()
     self.annotations_path = annotations_path
     self.images_path = images_path
     self.shuffle = shuffle
     self._initialize()
     self.queue = Queue.Queue()
     self.out_queue = Queue.Queue(maxsize=self.buffer_size)
     self._init_queues()
Example #57
class nlp:
    def __init__(self):
        self.tb = tb
        self.porter = nltk.PorterStemmer()
        self.tk = TreebankWordTokenizer()
        self.stopwords = set(stopwords.words())
    def tag(self,text):
        blob = self.tb(text)
        return blob.tags
    # clean controls stemming and punctuation removal
    def noun(self,text,clean=True):
        text = text.replace('\\n',' ')
        text = text.replace('\\t',' ')
        blob = self.tb(text)
        tags = blob.tags
        result = []
        for (aword,atag) in tags:
            if atag == "NNP" or atag == "NNS" or atag == "NN":
                result.append(aword.lower())

        if clean == True:
            clean_result = []
            for word in result:
                nword = porter.stem(remove_non_chap(word))
                #nword = small_stem(remove_non_chap(word))
                if len(nword) > 2:
                    clean_result.append(nword)
            return clean_result
        return result
        
    # this may not work very well; don't use it for now
    def noun_p(self,text):
        blob = self.tb(text)
        return blob.noun_phrases

    def token(self,text):
        result,clean_result = self.tk.tokenize(text),[]
        for word in result:
            nword = word.lower()
            nword = small_stem(nword)
            if len(nword) <= 30:
                clean_result.append(nword)
        return ' '.join(clean_result)
Example #58
	def __init__(self, mysql_con, redis_con, tokenizer = None, morph = None, classifier = None, points = []):
		"""
		Initialization.

		Args:
			mysql_con (PySQLPoolConnection): MySQL connection Object
			redis_con (StrictRedis): RedisDB connection Object
			tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
			morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens to normalized form. Requires a lot of memory, so it is not created for every event object. 
			classifier (Object): scikit trained classifier to detect real and fake events
			points (list[dict]): raw messages from event detector
		"""
		self.mysql = mysql_con
		self.redis = redis_con

		if morph:
			self.morph = morph
		else:
			self.morph = MorphAnalyzer()
		if tokenizer:
			self.tokenizer = tokenizer
		else:
			self.tokenizer = TreebankWordTokenizer()
		self.word = compile(r'^\w+$', flags = UNICODE | IGNORECASE)
		self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

		self.validity = None
		self.verification = None
		self.cores = {}
		self.classifier = classifier

		if points:
			self.id = str(uuid4())
			self.created = datetime.now()
			self.updated = datetime.now()

			self.messages = { x['id']:x for x in points }
			self.get_messages_data()
			self.media = {}
			self.get_media_data()
			self.event_update()
Example #59
    def __init__(self):
        self.sentim_analyzer = SentimentAnalyzer()
        self.genre_dict = read_file("jsons/movie_genre_quote_dict_2.json")
        context_file = "jsons/final_context.json"
        movie_file = "jsons/final_movies.json"
        quote_file = "jsons/final_quotes.json"
        year_rating_file = "jsons/final_year_rating.json"

        self.context = read_file(context_file)
        self.movies = read_file(movie_file)
        self.quotes = read_file(quote_file)
        self.year_rating_dict = read_file(year_rating_file)

        # Reincode to unicode
        for i in range(len(self.context)):
            self.context[i] = self.context[i].encode("utf-8").decode("utf-8")
            self.movies[i] = self.movies[i].encode("utf-8").decode("utf-8")
            self.quotes[i] = self.quotes[i].encode("utf-8").decode("utf-8")

        self.context, self.quotes, self.movies = quote_pruner(self.context, self.quotes, self.movies)

        self.inverted_index = read_file("jsons/f_inverted_index.json")
        self.idf = read_file("jsons/f_idf.json")

        # Initialize query tokenizer
        self.tokenizer = TreebankWordTokenizer()
        # Compute document norms
        self.norms = compute_doc_norms(self.inverted_index, self.idf, len(self.context))

        word_co_filename = "jsons/word_co.json"
        word_count_filename = "jsons/word_count_dict.json"
        pmi_dict_filename = "jsons/pmi_dict.json"
        # Read files
        self.word_co = read_file(word_co_filename)
        self.word_count_dict = read_file(word_count_filename)
        self.pmi_dict = read_file(pmi_dict_filename)
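Example #60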
from csv import writer
from datetime import datetime
from nltk.corpus import stopwords
from nltk.data import load
from nltk.corpus import sentiwordnet
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import TreebankWordTokenizer
import ujson
wordsTokenizer = TreebankWordTokenizer()
stopWords = set(stopwords.words('english'))
sentencesTokenizer = load('tokenizers/punkt/english.pickle')
arquivoClassificados = open('classificados.json')
classificados = ujson.load(arquivoClassificados)
arquivoClassificados.close()
acertos = 0
sentimentos = {}
comeco = datetime.now()
for resposta in classificados:
	texto = resposta['corpo']
	frases = sentencesTokenizer.tokenize(texto)
	palavras = []
	for frase in frases:
		palavrasTemp = wordsTokenizer.tokenize(frase)
		palavras.extend([palavra for palavra in palavrasTemp if palavra not in stopWords])
	posTags = pos_tag(palavras)
	positivo = 0
	negativo = 0
	for palavra, tag in posTags:
		synsets = None
		if tag.startswith('J'):