def stem_tokenize(doc, deacc=True, lowercase=True, errors="strict", stemmer=None):
    """Split `doc` into words and stem each word if a stemmer is given."""
    if stemmer is None:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield token
    else:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield stemmer.stemWord(token)
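# A minimal usage sketch for the generator above, not part of the original
# snippet: it assumes `tokenize` is gensim.utils.tokenize and that `stemmer`
# follows the PyStemmer interface (an object exposing stemWord()).
from gensim.utils import tokenize
import Stemmer  # PyStemmer

stemmer = Stemmer.Stemmer('english')
print(list(stem_tokenize("The cats were running quickly", stemmer=stemmer)))
# -> ['the', 'cat', 'were', 'run', 'quick']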
def convert(self, text):
    from gensim.utils import tokenize
    from numpy import asarray
    if isinstance(text, str):
        docs = [tokenize(text, to_lower=True, deacc=True)]
    else:
        docs = [tokenize(t, to_lower=True, deacc=True) for t in text]
    return [asarray([self(t) for t in doc], dtype='int32') for doc in docs]
def tokenize(self, content, BytesOrNot=False):
    """
    Tokenize a piece of text. Return a list of tokens as utf8 bytestrings.
    Ignore words shorter than 2 or longer than 15 characters (not bytes!).
    """
    # https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/corpora/wikicorpus.py#L166
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    if BytesOrNot:
        # return a list of utf8 bytestrings
        return [token.encode('utf8')
                for token in utils.tokenize(content, lower=True, errors='ignore')
                if 2 <= len(token) <= 15 and not token.startswith('_')]
    else:
        # return a list of strings
        return list(utils.tokenize(content, lower=True, errors='ignore'))
def characters(self, text):
    # for text, we only care about tokens directly within the <p> tag
    if self.path[-1] == 'p':
        tokens = [token.encode('utf8')
                  for token in utils.tokenize(text, errors='ignore')
                  if not token.isdigit()]
        self.tokens.extend(tokens)
def tokenize(self, document):
    """
    Break text into sentences and each sentence into a list of single words.
    Ignore any token that falls into the stopwords set.
    """
    # use sentence tokenizer sent_tokenize from nltk package
    sentences = sent_tokenize(utils.to_unicode(document.lower()))
    # create stemmer of class SnowballStemmer
    stemmer = SnowballStemmer("english")
    for sentence in sentences:
        words = [word for word in utils.tokenize(self.cleanse_text(sentence))]
        if self.remove_stopwords:
            words = [word for word in words if word not in self.en_stopwords]
        if self.stemming:
            words = [stemmer.stem(t) for t in words]
        yield words
def iter_documents(top_directory):
    numFound = 0
    for root, dirs, files in os.walk(top_directory):
        for dir1 in filter(lambda newspaper: newspaper != "TheCharlestonMercury-incomplete"
                           and newspaper != "VincennesCourant", dirs):
            # for dir1 in filter(lambda newspaper: newspaper == "TheCharlestonMercury-incomplete"
            #                    or newspaper == "VincennesCourant", dirs):
            for root2, dirs2, files2 in os.walk(top_directory + "/" + dir1):
                for dir2 in dirs2:
                    for root3, dirs3, files3 in os.walk(top_directory + "/" + dir1 + "/" + dir2):
                        for file1 in filter(lambda filee: filee.endswith('.txt'), files3):
                            document = open(os.path.join(root, dir1, dir2, file1)).read()
                            newline = str(numFound) + "," + dir1 + "/" + dir2 + "/" + file1 + ","
                            mdfile = open("accessible/" + dir1 + "/" + dir2 + "/"
                                          + file1[:-3] + "md", "r")
                            lines = mdfile.readlines()
                            for line in lines:
                                newline += line.split(", ")[1].strip("\n") + ","
                            aFile.write(newline[:-1] + "\n")
                            stoplist = set('for a of the and to in'.split())
                            resultwords = [word for word in document.split()
                                           if word.lower() not in stoplist]
                            result = ' '.join(resultwords)
                            # note: iterates over the *characters* of `result`,
                            # replacing every non-letter with a space
                            cleanedwords = [re.subn("[^a-zA-Z]+", ' ', word)[0]
                                            for word in result if '-' not in word]
                            resultfinal = ''.join(cleanedwords)
                            words = [word.strip() for word in resultfinal.split()]
                            final = ' '.join(words)
                            numFound += 1
                            yield utils.tokenize(resultfinal, lower=True)
def build_word_vector(n=0, mincount=1):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    corpus_name = 'WIKI_' + sbc
    sentences = []
    current_term = ""
    with io.open(corpus_name, 'r', encoding='utf8') as fin:
        for line in fin:
            if '\t' in line:
                current_term = line.strip().split('\t')[1]
            if line.strip().endswith('.'):
                if current_term in line:
                    if ' is a ' in line:
                        line = line.replace(' is a ', ' is_a ')
                    if ' is an ' in line:
                        line = line.replace(' is an ', ' is_a ')
                    # Single-tokenize terms.
                    depunct_term = "".join(['_' if ch in string.punctuation or ch == ' ' else ch
                                            for ch in current_term])
                    line = line.replace(current_term, depunct_term).lower()
                sentences.append(list(tokenize(line)))
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(bigram_transformer[sentences], size=100, window=5,
                     min_count=mincount, workers=3, iter=100)
    model.save(corpus_name + '.100epochs.phrasal.singletok.min' + str(mincount) + '.deep')
def tokenize(s, tokenizer):
    """
    Tokenizes a string. Returns a different list of tokens depending on which
    tokenizer is used.

    :param s: string to be tokenized
    :type s: str
    :param tokenizer: identifies tokenizer to use
    :type tokenizer: str
    :return: list of tokens
    :rtype: []
    """
    # compare strings with `==`, not `is` (identity is not guaranteed for literals)
    tokens = (twokenize.tokenize(s) if tokenizer == 'twokenize'
              else (utils.tokenize(s, lower=True) if tokenizer == 'gensim'
                    else TweetTokenizer(preserve_case=False).tokenize(s)))
    # list of symbols that can end sentences. twokenize has found these to not
    # be attached to another token (safe to remove).
    punct = r'.,!!!!????!:;'
    # NLTK english stopwords
    stoplist = stopwords.words('english')
    result = [tok.lower() for tok in tokens if tok not in punct]
    result = [tok for tok in result if tok not in stoplist]
    return result
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Tokenize a piece of text from wikipedia.

    Set `token_min_len`, `token_max_len` as character length (not bytes!)
    thresholds for individual tokens.

    Parameters
    ----------
    content : str
        String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`).
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
        If True - convert `content` to lower case.

    Returns
    -------
    list of str
        List of tokens from `content`.

    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
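# A quick demonstration of the tokenizer above, assuming gensim's defaults
# TOKEN_MIN_LEN = 2 and TOKEN_MAX_LEN = 15 and `from gensim import utils`.
TOKEN_MIN_LEN, TOKEN_MAX_LEN = 2, 15
print(tokenize("Anarchism is a political philosophy"))
# -> ['anarchism', 'is', 'political', 'philosophy']  ('a' is shorter than 2 chars)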
def tokenize_by_word(text):
    """Tokenize input text. Before tokenizing transforms text to lower case and
    removes accentuation and acronyms set :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    generator
        Generator that yields sequence words of the given text.

    Example
    -------
    >>> from gensim.summarization.textcleaner import tokenize_by_word
    >>> g = tokenize_by_word('Veni. Vedi. Vici.')
    >>> print(next(g))
    veni
    >>> print(next(g))
    vedi
    >>> print(next(g))
    vici

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def get_texts(self):
    with utils.smart_open(self.datafile) as inputfile:
        for line in inputfile:
            for f in self.preprocess:
                line = f(line)
            text = list(utils.tokenize(line, deacc=True, lowercase=True))
            yield text
def _tokenize_text_file(fname):
    with open(fname, "r") as f:
        doc = f.read()
        for word in utils.tokenize(doc, lowercase=True):
            if word not in STOPWORDS_SET:
                yield word
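# A tiny driver for the generator above; the file contents and the stopword
# set are illustrative assumptions, not taken from the original.
from gensim import utils

STOPWORDS_SET = {'the', 'of'}
with open('sample.txt', 'w') as f:
    f.write("The history of tokenization")
print(list(_tokenize_text_file('sample.txt')))
# -> ['history', 'tokenization']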
def get_similarity_list(new_doc):
    new_doc = utils.tokenize(new_doc)
    new_doc_bow = pubmed_corpus_lsi.corpus.corpus.dictionary.doc2bow(new_doc)
    new_doc_tfidf = pubmed_tfidf[new_doc_bow]
    new_doc_lsi = pubmed_lsi[new_doc_tfidf]
    new_doc_sims = pubmed_sim[new_doc_lsi]
    return new_doc_sims
def get_texts(self):
    for filename in self.input:
        root = ET.fromstring(open(filename).read())
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        yield tokenize(clean(open(filename).read(), lang, genre, tree))
def tokenizer2(d):
    """
    Tokenizer that returns a dictionary of the stemmed tokens, with the list
    of words that were transformed into that token.

    :param d: the document (text) to be tokenized
    :type d: unicode
    :rtype dict of tuple
    """
    def myreducer(d, t):
        """
        Receives a dictionary and a tuple of stem and word list.
        Adds {stem: [currList] + word list} to it.

        :param d: the dictionary
        :type d: dict
        :param t: the tuple, which should be (stem, word list)
        :type t: tuple
        :rtype dict
        """
        try:
            d[t[0]] += [t[1]]
        except KeyError:
            d[t[0]] = [t[1]]
        except:
            print(t)
            raise
        return d

    dic = {}
    if len(d):
        lW = [re.sub('[ _]+', ' ', w).strip() for w in utils.tokenize(d)
              if len(re.sub('[ _]+', ' ', w).strip())]
        lS = [mystem(w) for w in lW]
        lS, lW = processBiGrams(lS, lW)
        l = [(s, w) for s, w in zip(lS, lW)]
        if len(l) and usesVocab([w[0] for w in l]):
            l2 = removeVocab(l)
            if len(l2):
                dic = reduce(myreducer, l2, {})  # Python 3: from functools import reduce
    return dic
def tokenize(self, content):
    """
    Tokenization following the Wikipedia corpus conventions: any token
    shorter than 2 or longer than 15 characters is ignored, and the token
    must not start with '_'.
    """
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
def parse_paragraphs(self):
    for tag in self.paragraph_parse_tags:
        for element in self.content.find_all(tag):
            text = element.get_text(' ', strip=True).encode('ascii', "ignore")
            # TODO: remove hyperlinks
            text = " ".join(tokenize(text, lowercase=True))
            if text != '':
                self.paragraphs += [text]
def cleanText(text):
    plain_text = text.lower().replace("\n", " ").replace('ieee transactions on magnetics', '')
    plain_text = plain_text.replace('ieee', '').replace('abstract', '')
    result = list()
    for word in tokenize(plain_text):
        if word not in stoplist and re.search("[a-z]", word) and len(word) > 2:
            result.append(word.encode("utf8"))
    return result
def process_review(review):
    tokens = [token.encode('utf8')
              for token in utils.tokenize(review, lower=True, errors='ignore')
              if 2 <= len(token) <= 15]
    tokens = [norm(token) for token in tokens if norm(token)]
    tokens = [token for token in tokens if token not in stwords]
    # stem each token once instead of twice
    stemmed = (stemmer.stem(token) for token in tokens)
    tokens = [token for token in stemmed if token]
    return tokens
def getTokensFromEntry(entry):
    text = entry.get("review/text")
    if text is None:
        print("Empty Document")
        return ["None"]
    tokens = utils.tokenize(text, lower=True, errors='ignore')
    return tokens
def get_texts(self):
    length = 0
    self.input.seek(0)
    for line in self.input:
        length += 1
        line = re.sub(r"</?s>", "", line)
        line = line.rstrip("\n")
        yield utils.tokenize(line)
    self.length = length
def get_texts(self):
    """
    Iterate over the collection, yielding one document at a time. A document
    is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

    Override this function to match your input (parse input files, do any
    text preprocessing, lowercasing, tokenizing etc.). There will be no
    further preprocessing of the words coming out of this function.
    """
    # Instead of raising NotImplementedError, let's provide a sample implementation:
    # assume documents are lines in a single file (one document per line).
    # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
    with self.getstream() as lines:
        for lineno, line in enumerate(lines):
            if self.metadata:
                yield utils.tokenize(line, lowercase=True), (lineno,)
            else:
                yield utils.tokenize(line, lowercase=True)
def get_texts(self):
    for path in self.filepaths:
        with codecs.open(path, encoding='utf8') as f:
            raw_text = f.read()
        raw_text = raw_text.lower()
        for filt in self.preprocess:
            raw_text = filt(raw_text)
        text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
        yield text
def _preprocess_text(text, stem=False):
    """
    Performs common atomic operations on one text chunk - tokenization,
    normalization.

    :param text:
    :return:
    """
    # note: under Python 3, filter/map return lazy iterators
    words = filter(lambda x: x not in STOPS, map(lambda x: x.lower(), tokenize(text)))
    if stem:
        porter = PorterStemmer()
        words = map(porter.stem, words)
    return words
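# A sketch of calling the helper above with a toy stopword set (STOPS is an
# assumption for the demo; the real set is defined elsewhere). The result is
# materialized with list() since filter/map are lazy in Python 3.
STOPS = {'the', 'a'}
print(list(_preprocess_text("The running foxes", stem=True)))
# -> ['run', 'fox']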
def add(self, text):
    if text is None:
        return
    from gensim.utils import tokenize
    if isinstance(text, str):
        docs = [tokenize(text, to_lower=True)]
    else:
        docs = [tokenize(t, to_lower=True) for t in text]
    for doc in docs:
        for t in doc:
            if t in self._token_counts:
                self._token_counts[t] += 1
            else:
                self._token_counts[t] = 1
                self.id2token.append(t)
                self.token2id[t] = self._id
                self._id += 1
def get_texts(self):
    for index in self.input[0]:
        root = ET.fromstring(open(self.input[1][index]).read())
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        # clean and tokenize one document per input file
        text = clean(open(self.input[1][index]).read(), lang, genre, tree)
        yield tokenize(text)
def get_texts(self, raw=False):
    """Yield raw text or tokenized text."""
    for j in self.get_json():
        text = j["text"]
        if raw:
            yield text
        else:
            yield utils.tokenize(text, deacc=True, lowercase=True)
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filterWiki()`).

    Return tokens as utf8 bytestrings.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=True, errors='ignore')
            if len(token) <= 15 and not token.startswith('_')]
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or
    longer than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=False, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
def get_texts(self):
    stoplist = set('for a of the and to in'.split())  # add http?
    for fname in os.listdir(self.dirname):
        W = []
        for line in open(os.path.join(self.dirname, fname)):
            line = re.sub(' "source":(.[^,]+)",', '', line)  # remove json.loads corrupters
            w = json.loads(line)
            # tokenize and remove common words
            w = utils.tokenize(w['text'], lowercase=True)
            w = [word for word in w if word not in stoplist]
            W.extend(w)
        yield W
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or
    longer than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        token.encode('utf8')
        for token in utils.tokenize(content, lower=True, errors='ignore')
        if 2 <= len(token) <= 15 and not token.startswith('_')
    ]
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Set `token_min_len`, `token_max_len` as character length (not bytes!)
    thresholds for individual tokens.

    Return list of tokens as unicode strings.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
def get_texts(self):
    """
    Iterate over the collection, yielding one document at a time. A document
    is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

    Override this function to match your input (parse input files, do any
    text preprocessing, lowercasing, tokenizing etc.). There will be no
    further preprocessing of the words coming out of this function.
    """
    # Instead of raising NotImplementedError, let's provide a sample implementation:
    # assume documents are lines in a single file (one document per line).
    # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
    for lineno, line in enumerate(getstream(self.input)):
        yield utils.tokenize(line, lowercase=True)
def train_embedding(self):
    print("\nTraining embedding\n")
    encoder = {self.pad_word: 0, self.unk_word: 1}
    probs = {self.pad_word: 1, self.unk_word: 1}
    data = tfds.load('imdb_reviews/plain_text', split='unsupervised', data_dir=DATA_DIR)
    total_words = 2  # pad and unknown
    entry_count = 2
    max_len = 0
    for text in tfds.as_numpy(data):
        # drop the first tokens introduced by str()-ing the raw bytes
        tokens = list(tokenize(str(text), lowercase=True))[3:]
        for idx, word in enumerate(tokens):
            total_words += 1
            if word not in encoder:
                encoder[word] = entry_count  # assign the id first, then advance the counter
                probs[word] = 1
                entry_count += 1
            else:
                probs[word] += 1
            if idx > max_len:
                max_len = idx
    print(f"The vocabulary size is {entry_count}")
    print(f"The maximum length of a review is {max_len + 1}")
    probs = {k: v / total_words for k, v in probs.items()}
    probs[self.pad_word] = 1 - np.finfo(np.float32).eps
    probs[self.unk_word] = np.finfo(np.float32).eps
    glove2word2vec(self.glove_input_file, self.model_file)
    model = KeyedVectors.load_word2vec_format(self.model_file, binary=False)
    print("Creating matrix")
    skipped_words = 0
    emb_matrix = np.zeros((entry_count, self.vec_len), dtype=np.float32)
    # index rows by each word's id, so matrix rows line up with the encoder
    for word, idx in encoder.items():
        try:
            emb_matrix[idx] = model[word]
        except KeyError:
            skipped_words += 1
    print(f"Skipped {skipped_words} out of {entry_count}")
    np.save(open(self.matrix_file, 'wb'), emb_matrix)
    pickle.dump(encoder, open(self.encoder_file, 'wb'), protocol=0)
    pickle.dump(probs, open(self.probs_file, 'wb'), protocol=0)
    return encoder, probs, emb_matrix
def _clean_text(self, the_tweet_text):
    cleaned_text = p.clean(the_tweet_text).lower().replace("’", "'")
    words = cleaned_text.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    cleaned_text = " ".join(reformed)
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))
    cleaned_text = self._removeNonAscii(cleaned_text)
    tokenized_text = list(tokenize(cleaned_text))
    tokenized_text = self._remove_stops(tokenized_text)
    return tokenized_text
def process_post(args):
    """Normalize an entry into tokens."""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = [token.encode('utf8')
                  for token in utils.tokenize(text, lower=True, errors='ignore')
                  if 2 <= len(token) <= 15 and not token.startswith('_')]
    return result, subject, pageid
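# A hypothetical single-process invocation of the worker above (it is shaped
# for multiprocessing.Pool.map, taking one args tuple). `url_re` is assumed
# to be a compiled URL regex; it is not part of the original snippet.
import re
url_re = re.compile(r'https?://\S+')
args = ("see http://example.com for details", False, "gensim digest", 42)
tokens, subject, pageid = process_post(args)
# tokens is a list of utf8 bytestrings, e.g. [b'gensim', b'digest', b'see', ...]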
def fasttext_model_train(data, from_scratch):
    # Preprocessing like stopword removal @TODO
    ge_sentences = [list(tokenize(s)) for s in data['text'].to_list()]
    if from_scratch:
        model = FastText(bucket=1000000, window=3, min_count=1, size=300)
        model.build_vocab(sentences=ge_sentences)
        model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=10)
    else:
        print("loading pretrained fastText vectors")
        model = FastText.load_fasttext_format('content/cc.en.300')
        model.build_vocab(ge_sentences, update=True)
        # model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=5)
    return model
def __iter__(self):
    if self._corpus_fpath.endswith(".gz"):
        # gzip.open's third positional argument is compresslevel, not encoding;
        # open in text mode with an explicit encoding instead
        corpus = gzip.open(self._corpus_fpath, "rt", encoding="utf-8")
    else:
        corpus = codecs.open(self._corpus_fpath, "r", "utf-8")
    for line in corpus:
        yield list(tokenize(line, lowercase=False, deacc=False, errors='strict'))
def get_texts(self):
    i = 0
    for fn in self.input:
        if i > 100:
            break
        i += 1
        text = open(fn, 'r').read()
        # alternative: lemmatize each token
        # yield [CorpusNTA.lmtzr.lemmatize(word)
        #        for word in utils.tokenize(text, deacc=True, lower=True)
        #        if word not in CorpusNTA.stoplist]
        yield [word for word in utils.tokenize(text, deacc=True, lower=True)
               if word not in CorpusNTA.stoplist]
        if i % 100 == 0:
            print("%d documents processed" % i)
def create_bow_corpus(textFile, dictionary, outputDir):
    """
    Creates a Gensim bag-of-words corpus from a Gensim dictionary and saves it.

    textFile (string): Location of the specified text file used in dictionary generation
    dictionary (Gensim dictionary object): Dictionary object used in corpus generation
    outputDir (string): Location to save the corpus
    """
    outputFile = outputDir + 'DBLP_Corpus.mm'
    with open(textFile, encoding='utf-8') as iFile:
        corpus = [dictionary.doc2bow(tokenize(line), allow_update=True) for line in iFile]
    corpora.MmCorpus.serialize(outputFile, corpus)
    print('Corpus created and stored at: ' + outputFile)
    return corpus
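# A minimal, hypothetical driver for the function above; the input file name
# and the way the Dictionary is built are assumptions, not from the original.
from gensim import corpora
from gensim.utils import tokenize

with open('dblp_titles.txt', encoding='utf-8') as f:
    dictionary = corpora.Dictionary(list(tokenize(line)) for line in f)
corpus = create_bow_corpus('dblp_titles.txt', dictionary, 'output/')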
def get_text_content(root):
    path = './body/body.content/block[@class="full_text"]'
    full_text = root.find(path)
    if full_text is None:
        return (None, 0)
    text = ''.join(full_text.itertext()).strip()
    # Ditch lead paragraph
    lines = text.split("\n")
    if lines[0].startswith('LEAD:'):
        lines = lines[1:]
    text = ' '.join(lines)
    # with open("corpora/scratch.txt", "a") as f:
    #     f.write(text)
    #     f.write("\n---\n")
    tokens = [t.lower() for t in tokenize(text)]
    wc = len(tokens)
    return (' '.join(tokens), wc)
def body_topic(dataframe):
    text_body = dataframe['body'].values
    text_body = [remove_stopwords(body) for body in text_body]
    # deacc/lowercase take booleans, not strings
    text_body = [tokenize(body, deacc=True, lowercase=True) for body in text_body]
    text_body = [[snow.stem(token) for token in word_list] for word_list in text_body]
    dataframe['tokens'] = [list(gen) for gen in text_body]
    dataframe['corpus'] = [dictionary.doc2bow(doc) for doc in dataframe['tokens']]
    dataframe['predicted_topic'] = [
        probs_to_topic(topic_probs)
        for topic_probs in topic_model.get_document_topics(dataframe['corpus'])
    ]
    return dataframe
def preprocess(content):
    """
    params -: raw text scraped from website
    return -: list of words after:
        1) tokenization
        2) removing stopwords and some insignificant words
        3) converting to lowercase
        4) lemmatizing
        5) removing common web terms
    """
    content = tokenize(content, deacc=True)
    content = list(filter(is_significant, content))
    content = [token.lower() for token in content]
    MIN_WORDS = 30  # minimum words needed to decide whether site is english or not
    if len(content) > MIN_WORDS and not is_english(content):
        return ['invalidcontentfound']  # signal for non-english site
    content = [lemmatize(token) for token in content
               if token not in STOPWORDS and token in dictionary]
    content = [token for token in content if token not in AVOID]
    return content
def simple_preprocess(doc: str, lower: bool = False, deacc: bool = False,
                      min_len: int = 2, max_len: int = 15) -> List[str]:
    r"""
    Gensim's simple_preprocess with an added 'lower' param to indicate whether
    or not to lowercase all the tokens in the text.

    For more information see: https://radimrehurek.com/gensim/utils.html
    """
    # pass the `lower` argument through instead of hard-coding False
    tokens = [
        token for token in tokenize(doc, lower=lower, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens
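# Usage sketch for the wrapper above (assumes `from typing import List` and
# `from gensim.utils import tokenize` are in scope, as the signature implies).
print(simple_preprocess("Ceci est une Phrase accentuée", lower=True, deacc=True))
# -> ['ceci', 'est', 'une', 'phrase', 'accentuee']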
def gen_vocab(tweets):
    vocab, reverse_vocab = {}, {}
    vocab_index = 1
    for tweet in tweets:
        text = tokenize(tweet.lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
    return vocab
def lemmatize(content):
    """
    Use the English lemmatizer from `pattern` to extract tokens in their base
    form=lemma, e.g. "are, is, being" -> "be" etc.

    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (=all other
    lemmas are discarded).
    """
    content = u' '.join(utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if utils.ALLOWED_TAGS.match(tag):
                    result.append(lemma.encode('utf8'))
    return result
def word2vec_classifier(dataset):
    documents = []
    for line in dataset:
        # Wrapper method for tokenizing
        tokens = tokenize(line[0], lower=True)
        sentence = LabeledSentence(tokens, line[1])
        documents.append(sentence)
    log.info("Doc2Vec %d lines" % (len(documents)))

    # Model parameters
    num_features = 100
    min_word_count = 1
    num_workers = 8
    context = 2
    downsampling = 1e-3

    d2v_model = Doc2Vec(min_count=min_word_count, window=context, size=num_features,
                        sample=downsampling, workers=num_workers)

    log.info("Training doc vectors")
    train_set, test_set = train_test_split(documents, train_size=0.7, test_size=0.3)
    train_vec = getAvgFeatureVecs(train_set, d2v_model, num_features)
    test_vec = getAvgFeatureVecs(test_set, d2v_model, num_features)
    train_vec = Imputer().fit_transform(train_vec)
    test_vec = Imputer().fit_transform(test_vec)

    # train model and predict with LinearSVC
    model = LinearSVC()
    train_labels = [doc.tags for doc in train_set]  # one label per training document
    classifier_fitted = OneVsRestClassifier(model).fit(train_vec, train_labels)
    result = classifier_fitted.predict(test_vec)

    # output result to csv
    result.tofile("./d2v_linsvc.csv", sep='\t')

    # store the model to mmap-able files
    joblib.dump(model, 'model/%s.pkl' % 'd2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vec)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(class_list, classes=class_list)

    # generate_eval_metrics(binarise_result, 'w2v_linsvc', binarise_labels)
    generate_report(binarise_result, 'w2v_linsvc', binarise_labels)
def prepare_corpus(dirname, text_cutoff=1000000):
    underscore = re.compile(r'\_')
    authors, titles, texts = [], [], []
    for filename in sorted(glob.glob(dirname + "/*")):
        if '_' in filename:
            author, title = underscore.split(
                os.path.split(filename)[-1].replace(".txt", ""), maxsplit=1)
        else:
            author, title = next(DUMMY_AUTHORS), os.path.basename(filename).replace(".txt", "")
        authors.append(author)
        titles.append(title)
        with open(filename) as infile:
            texts.append(list(islice(
                tokenize(infile.read(), lowercase=True, deacc=True), 0, text_cutoff)))
    return Dataset(texts, titles, authors)
def clean_text_by_word(text, deacc=True):
    """
    Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit.
    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "")
                      for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tagger.tag(original_words)  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    """Tokenize words in the corpus."""
    if lower:
        # Turkish casing: 'I' must map to dotless 'ı' and 'İ' to dotted 'i',
        # so a custom translation map is used instead of str.lower()
        lower_map = {
            ord(u'A'): u'a', ord(u'B'): u'b', ord(u'C'): u'c', ord(u'Ç'): u'ç',
            ord(u'D'): u'd', ord(u'E'): u'e', ord(u'F'): u'f', ord(u'G'): u'g',
            ord(u'Ğ'): u'ğ', ord(u'H'): u'h', ord(u'I'): u'ı', ord(u'İ'): u'i',
            ord(u'J'): u'j', ord(u'K'): u'k', ord(u'L'): u'l', ord(u'M'): u'm',
            ord(u'N'): u'n', ord(u'O'): u'o', ord(u'Ö'): u'ö', ord(u'P'): u'p',
            ord(u'R'): u'r', ord(u'S'): u's', ord(u'Ş'): u'ş', ord(u'T'): u't',
            ord(u'U'): u'u', ord(u'Ü'): u'ü', ord(u'V'): u'v', ord(u'Y'): u'y',
            ord(u'Z'): u'z',
        }
        content = content.translate(lower_map)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=False, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
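# Why the explicit map above matters: str.lower() would map 'I' to 'i', but
# Turkish pairs dotless 'I'/'ı' and dotted 'İ'/'i'. A quick check:
print(tokenize_tr(u'ISPARTA İstanbul'))
# -> ['ısparta', 'istanbul']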
def get_paper_words(author_papers):
    paper_character = []
    for i, paper in enumerate(author_papers):
        title = [word.lower() for word in tokenize(paper['title'])]
        abstract = []
        keywords = []
        text = []
        # if 'abstract' in paper.keys() and paper['abstract'] is not None:
        #     abstract = [word.lower() for word in tokenize(paper['abstract'])]
        if 'keywords' in paper.keys() and paper['keywords'] is not None:
            keywords = [word.lower() for word in paper['keywords']]
        text = title + abstract + keywords  # merge title, keywords and abstract
        text = [word for word in text
                if (word not in my_stopwords) and (word not in stopwords.words('english'))]
        paper_character.append(text)
    return paper_character
def preprocess_text(document):
    """
    Performs advanced preprocessing on a string and returns a lemmatized list
    of tokens.

    :param document: Document string to be preprocessed
    :return: List of preprocessed tokens
    """
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(CUSTOM_STOP_WORDS)
    en_stop = set(stop_words)
    tokens = utils.tokenize(document, lowercase=True, deacc=True)
    tokens = [str(token) for token in tokens
              if token not in en_stop and not token.startswith('_')]
    tokens = [get_lemma2(token) for token in tokens]
    return tokens
def __iter__(self):
    for directory in CowReader.dirs:
        with codecs.open(os.path.join(CowReader.root, directory, directory + ".xml"),
                         encoding='utf-8') as infile:
            sentence = []
            for line in infile:
                if line.startswith('<s'):
                    continue
                elif line.startswith('</s>'):
                    yield sentence
                    sentence = []
                else:
                    word, pos, lemma = line.strip().split('\t')
                    if pos not in ('$.', 'punc'):
                        sentence.append(word.lower())
    with codecs.open(CowReader.vvb, encoding='utf-8') as vvb:
        for sentence in vvb:
            yield list(tokenize(sentence, lowercase=True))
def prepare_text(self, plain_text):
    tokens = list(tokenize(plain_text))
    tokens = [x for x in tokens if x.lower() not in STOPWORDS]
    plain_text = " ".join(tokens)
    # Phrases expects an iterable of sentences (each a list of tokens)
    bigram_mdl = Phrases([tokens], min_count=1, threshold=2)
    custom_filters = [strip_punctuation, strip_numeric]
    tokens = preprocess_string(plain_text, custom_filters)
    tokens = [t for t in tokens if len(t) > 2]
    bigrams = bigram_mdl[tokens]
    words = list(bigrams)
    words = [re.sub('_', '-', word) for word in words]
    vecs = [self.word2vec[word] if word in self.word2vec.keys()
            else np.zeros(shape=(1, 20)) for word in words]
    # return list of arrays, each array is vector of a single word
    return vecs
def get_texts(self):
    stoplist = set('for a of the and to in'.split())  # add http?
    for fname in os.listdir(self.dirname):
        W = []
        print(os.path.join(self.dirname, fname))
        for line in io.open(os.path.join(self.dirname, fname), 'r', encoding='windows-1252'):
            line = re.sub(' "source":(.[^,]+)",', '', line)  # remove json.loads corrupters
            line = re.sub("(?<=[^a-z])(')(?=.)|(?<=.)(')(?=[^a-z])", '"', line)
            w = json.loads(line)
            # tokenize and remove common words
            w = utils.tokenize(w['text'], lowercase=True)
            w = [word for word in w if word not in stoplist]
            W.extend(w)
        yield W
def predict(text):
    x = np.zeros((1, timesteps, 300), dtype=np.float32)
    tokens = tokenize(text)
    mj = 0
    for w in tokens:
        if mj < timesteps:
            try:
                x[0][mj] = w2v.word_vec(w)
                mj += 1
            except KeyError:  # out-of-vocabulary word: skip it
                continue
        else:
            break
    return model.predict(x)
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.summarization.textcleaner import clean_text_by_word
        >>> clean_text_by_word("God helps those who help themselves")
        {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
        'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
        'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "")
                      for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def _extract_texts(self, content):
    # remove all \n
    # gensim's tokenizer and to lowercase
    # remove stop words
    # remove infrequent words (not implemented below)
    try:
        title = content.get("title", "")
        comments = content.get("comments", [])
        comments = map(lambda x: x.replace("\n", " "), comments)
        raw_texts = "{} {}".format(title, "\n".join(comments))
        # tokenize
        # from nltk.tokenize import RegexpTokenizer
        # tokenizer = RegexpTokenizer('[a-zA-Z][a-zA-Z0-9]*')
        tokens = list(tokenize(raw_texts, lower=True, deacc=True))
        # stopwords
        clean_tokens = [t for t in tokens if t not in en_stopwords]
        texts = " ".join(clean_tokens)
        return texts
    except Exception:
        traceback.print_exc()