Example #1
def stem_tokenize(doc, deacc=True, lowercase=True, errors="strict", stemmer=None):
    """Split the document into words and stem each word if a stemmer is given."""
    if stemmer is None:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield token
    else:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield stemmer.stemWord(token)
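A minimal usage sketch for the generator above, assuming `tokenize` is gensim.utils.tokenize and the stemmer is a PyStemmer object (PyStemmer exposes the `stemWord` method used here):

from gensim.utils import tokenize
import Stemmer  # PyStemmer, optional stemming dependency of gensim

stemmer = Stemmer.Stemmer('english')
print(list(stem_tokenize("Cats are running", stemmer=stemmer)))
# e.g. ['cat', 'are', 'run']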
    def convert(self, text):
        from gensim.utils import tokenize
        from numpy import asarray

        if isinstance(text, str):
            docs = [tokenize(text, to_lower=True, deacc=True)]
        else:
            docs = [tokenize(t, to_lower=True, deacc=True) for t in text]

        return [asarray([self(t) for t in doc], dtype='int32') for doc in docs]
Example #3
 def tokenize(self, content, BytesOrNot=False):
     """
     Tokenize a piece of text.
     Return a list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
     than 15 characters (not bytes!).
     """
     # https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/corpora/wikicorpus.py#L166
     # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
     if BytesOrNot:
         # return a list of utf8-encoded bytestrings
         return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
                 if 2 <= len(token) <= 15 and not token.startswith('_')]
     else:
         # return a list of unicode strings
         return list(utils.tokenize(content, lower=True, errors='ignore'))
Example #4
 def characters(self, text):
     # for text, we only care about tokens directly within the <p> tag
     if self.path[-1] == 'p':
         tokens = [
             token.encode('utf8') for token in utils.tokenize(text, errors='ignore') if not token.isdigit()
         ]
         self.tokens.extend(tokens)
    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = list(utils.tokenize(self.cleanse_text(sentence)))

            if self.remove_stopwords:
                words = [word for word in words if word not in self.en_stopwords]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
def build_word_vector(n=0, mincount=1):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    corpus_name = 'WIKI_'+sbc
    sentences = []
    current_term = ""
    with io.open(corpus_name, 'r', encoding='utf8') as fin:
        for line in fin:
            if '\t' in line:
                current_term = line.strip().split('\t')[1]
            if line.strip().endswith('.'):
                if current_term in line:
                    if ' is a ' in line:
                        line = line.replace(' is a ', ' is_a ')
                    if ' is an ' in line:
                        line = line.replace(' is an ', ' is_a ')
                    
                    # Single tokenize terms.
                    depunct_term = "".join(['_' if ch in string.punctuation or 
                                            ch == ' ' else ch 
                                            for ch in current_term])
                    line = line.replace(current_term, depunct_term).lower()
                    sentences.append(list(tokenize(line)))
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(bigram_transformer[sentences], size=100, window=5, 
                     min_count=mincount, workers=3, iter=100)
    model.save(corpus_name+'.100epochs.phrasal.singletok.min'+str(mincount)+'.deep')
Example #8
def tokenize(s, tokenizer):
    """
    Tokenizes a string. Returns a different list of tokens depending on which tokenizer is used.

    :param s: string to be tokenized
    :type s: str
    :param tokenizer: identifies tokenizer to use
    :type tokenizer: str
    :return: list of tokens
    :rtype: []
    """
    tokens = (twokenize.tokenize(s)
              if tokenizer == 'twokenize'
              else (utils.tokenize(s, lower=True)
                    if tokenizer == 'gensim'
                    else TweetTokenizer(preserve_case=False).tokenize(s)))

    # list of symbols that can end sentences. twokenize has found these to not be attached to another token.
    # (safe to remove)
    punct = r'.,!!!!????!:;'

    # NLTK english stopwords
    stoplist = stopwords.words('english')

    result = [tok.lower() for tok in tokens if tok not in punct]
    result = [tok for tok in result if tok not in stoplist]
    return result
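A small usage sketch for the dispatcher above, assuming the module-level imports (gensim's utils, NLTK's stopwords and TweetTokenizer, twokenize) are in place and the NLTK stopword corpus has been downloaded:

print(tokenize("The quick brown fox jumps!!!", tokenizer='gensim'))
# e.g. ['quick', 'brown', 'fox', 'jumps']  -- stopwords and stray punctuation removed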
Example #9
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Tokenize a piece of text from wikipedia.

    Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens.

    Parameters
    ----------
    content : str
        String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`).
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         If True - convert `content` to lower case.

    Returns
    -------
    list of str
        List of tokens from `content`.

    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token) for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
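A quick check of the length filtering above; TOKEN_MIN_LEN and TOKEN_MAX_LEN default to 2 and 15 in gensim.corpora.wikicorpus:

print(tokenize("a ab abc supercalifragilisticexpialidocious"))
# ['ab', 'abc']  -- single-character and over-long tokens are dropped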
Example #10
def tokenize_by_word(text):
    """Tokenize input text. Before tokenizing transforms text to lower case and removes accentuation and acronyms set
    :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    generator
        Generator that yields the sequence of words of the given text.

    Example
    -------
    >>> from gensim.summarization.textcleaner import tokenize_by_word
    >>> g = tokenize_by_word('Veni. Vedi. Vici.')
    >>> print(next(g))
    veni
    >>> print(next(g))
    vedi
    >>> print(next(g))
    vici

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
Example #11
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
	def get_texts(self):
		with utils.smart_open(self.datafile) as inputfile:
			for line in inputfile:
				for f in self.preprocess:
					line = f(line)
				text = list(utils.tokenize(line, deacc=True, lowercase=True))
				yield text
def _tokenize_text_file(fname):
    with open(fname, "r") as f:
        doc = f.read()

        for word in utils.tokenize(doc, lowercase=True):
            if word not in STOPWORDS_SET:
                yield word
def get_similarity_list(new_doc):
    new_doc = utils.tokenize(new_doc)
    new_doc_bow = pubmed_corpus_lsi.corpus.corpus.dictionary.doc2bow(new_doc)
    new_doc_tfidf = pubmed_tfidf[new_doc_bow]
    new_doc_lsi = pubmed_lsi[new_doc_tfidf]
    new_doc_sims = pubmed_sim[new_doc_lsi]
    return new_doc_sims
	def get_texts(self): 
		for filename in self.input:
			root = ET.fromstring(open(filename).read())
			lang = root.attrib['lang'].lower()
			genre = root.attrib['type']
			tree = ET.ElementTree(root)
			yield tokenize(clean(open(filename).read(),lang,genre,tree))
Example #16
def tokenizer2(d):
    """ Tokenizer that returns a dictionary of the stemmed tokens, with the list of words that were
        transformed into that token.
        :param d: the document (text) to be tokenized
        :type d: unicode
        :rtype dict of tuple """
    def myreducer(d, t):
        """ Receives a dictionary and a tuple of stem and word list. Adds {stem: [currList] + word list} to it.
        :param d: the dictionary
        :type d: dict
        :param t: the tuple, which should be (stem, word list)
        :type t: tuple
        :rtype dict """
        try:
            d[t[0]] += [t[1]]
        except KeyError:
            d[t[0]] = [t[1]]
        except:
            print(t)
            raise
        return d

    dic = {}
    if len(d):
        lW = [re.sub('[ _]+', ' ', w).strip() for w in utils.tokenize(d) if len(re.sub('[ _]+', ' ', w).strip())]
        lS = [mystem(w) for w in lW]
        lS, lW = processBiGrams(lS, lW)
        l = [(s, w) for s, w in  zip(lS, lW)]
        if len(l) and usesVocab([w[0] for w in l]):
            l2 = removeVocab(l)
            if len(l2):
                dic = reduce(myreducer, l2, {})
    return dic
Example #17
 def tokenize(self, content):
     """
     Tokenization according to Wikipedia corpus, where any token less than 2
     characters long and greater than 15 characters long is ignored. The
     token must not start with '_'.
     """
     return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
             if 2 <= len(token) <= 15 and not token.startswith('_')]
 def parse_paragraphs(self):
     for tag in self.paragraph_parse_tags:
         for element in self.content.find_all(tag):
             text = element.get_text(' ', strip=True).encode('ascii', "ignore")
             # TODO: remove hyperlinks
             text = " ".join(tokenize(text, lowercase=True))
             if text != '':
                 self.paragraphs += [text]
Example #19
def cleanText(text):
    plain_text = text.lower().replace("\n"," ").replace('ieee transactions on magnetics','')
    plain_text = plain_text.replace('ieee','').replace('abstract','')
    result = list()
    for word in tokenize(plain_text):
        if word not in stoplist and re.search("[a-z]", word) and len(word) > 2:
            result.append(word.encode("utf8"))
    return result
Example #20
def process_review(review):
    #return [token.encode('utf8') for token in utils.tokenize(review, lower=True, errors='ignore')
    #        if 2 <= len(token) <= 15]
    tokens = [token.encode('utf8') for token in utils.tokenize(review, lower=True, errors='ignore')
            if 2 <= len(token) <= 15]
    tokens = [norm(token) for token in tokens if norm(token)]
    tokens = [token for token in tokens if token not in stwords]
    tokens = [stemmer.stem(token) for token in tokens if stemmer.stem(token)]
    return tokens
Example #21
def getTokensFromEntry(entry):
    if stopwords is None:
        pass
    text = entry.get("review/text")
    if text is None:
        print("Empty Document")
        return ["None"]
    tokens= utils.tokenize(text, lower=True, errors='ignore')
    return tokens
Example #22
 def get_texts(self):
     length = 0
     self.input.seek(0)
     for line in self.input:
         length += 1
         line = re.sub(r"</?s>", "", line)
         line = line.rstrip("\n")
         yield utils.tokenize(line)
     self.length = length
Example #23
    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        with self.getstream() as lines:
            for lineno, line in enumerate(lines):
                if self.metadata:
                    yield utils.tokenize(line, lowercase=True), (lineno,)
                else:
                    yield utils.tokenize(line, lowercase=True)
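A sketch of how a corpus with this get_texts() is typically consumed; `MyLineCorpus` and the file path are hypothetical stand-ins for a TextCorpus subclass using the method above (with metadata disabled):

from gensim import corpora

corpus = MyLineCorpus('documents.txt')  # hypothetical subclass and path
dictionary = corpora.Dictionary(corpus.get_texts())
bow_corpus = [dictionary.doc2bow(list(tokens)) for tokens in corpus.get_texts()]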
Example #24
 def get_texts(self):
     for path in self.filepaths:
         with codecs.open(path, encoding='utf8') as f:
             raw_text = f.read()
             raw_text = raw_text.lower()
             for filt in self.preprocess:
                 raw_text = filt(raw_text)
             text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
             yield text
Example #25
def _preprocess_text(text, stem=False):
    """ Performs common atomic operations on one text chunk - tokenization, normalization
    :param text:
    :return:
    """
    words = filter(lambda x: x not in STOPS, map(lambda x: x.lower(), tokenize(text)))
    if stem:
        porter = PorterStemmer()
        words = map(porter.stem, words)
    return words
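A hypothetical setup for the helper above: STOPS and PorterStemmer are not shown in the original, so a plain stopword set and NLTK's Porter stemmer are assumed here:

from nltk.stem import PorterStemmer
from gensim.utils import tokenize

STOPS = {"the", "a", "of"}
print(list(_preprocess_text("The running of the dogs", stem=True)))
# e.g. ['run', 'dog']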
    def add(self, text):
        if text is None: return

        from gensim.utils import tokenize

        if isinstance(text, str):
            docs = [tokenize(text, to_lower=True)]
        else:
            docs = [tokenize(t, to_lower=True) for t in text]

        for doc in docs:
            for t in doc:
                if t in self._token_counts:
                    self._token_counts[t] += 1
                else:
                    self._token_counts[t] = 1
                    self.id2token.append(t)
                    self.token2id[t] = self._id
                    self._id += 1
	def get_texts(self): 
		text = ""
		for index in self.input[0]:
			root = ET.fromstring(open(self.input[1][index]).read())
			lang = root.attrib['lang'].lower()
			genre = root.attrib['type']
			tree = ET.ElementTree(root)
			string = clean(open(self.input[1][index]).read(),lang,genre,tree)
			text += string
		yield tokenize(text)
 def get_texts(self,raw=False):
     """
     yield raw text or tokenized text
     """
     for j in self.get_json():
         text = j["text"]
         if raw:
             yield text
         else:
             yield utils.tokenize(text, deacc=True, lowercase=True)
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filterWiki()`).
    
    Return tokens as utf8 bytestrings. 
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
            if len(token) <= 15 and not token.startswith('_')]
Example #30
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
    than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8') for token in utils.tokenize(content, lower=False, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
    def get_texts(self):
        stoplist = set('for a of the and to in'.split()) # add http?
        for fname in os.listdir(self.dirname):
            W = []
            for line in open(os.path.join(self.dirname, fname)):
                line = re.sub(' "source":(.[^,]+)",', '', line)  # remove json.loads corrupters
                w = json.loads(line)

                # tokenize and remove common words
                w = utils.tokenize(w['text'], lowercase=True)
                w = [word for word in w if word not in stoplist]

                W.extend(w)
            yield W
Example #32
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
    than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        token.encode('utf8')
        for token in utils.tokenize(content, lower=True, errors='ignore')
        if 2 <= len(token) <= 15 and not token.startswith('_')
    ]
Example #33
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is assumed
    to be mark-up free (see `filter_wiki()`).

    Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens.

    Return list of tokens as unicode strings.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token) for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
Example #34
 def get_texts(self):
     """
     Iterate over the collection, yielding one document at a time. A document
     is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.
     
     Override this function to match your input (parse input files, do any
     text preprocessing, lowercasing, tokenizing etc.). There will be no further
     preprocessing of the words coming out of this function.
     """
     # Instead of raising NotImplementedError, let's provide a sample implementation:
     # assume documents are lines in a single file (one document per line).
     # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
     for lineno, line in enumerate(getstream(self.input)):
         yield utils.tokenize(line, lowercase=True)
Example #35
    def train_embedding(self):
        print("\nTraining embedding\n")
        encoder = {self.pad_word: 0, self.unk_word: 1}
        # decoder = {0.0: self.pad_word, 1.0: self.unk_word}
        probs = {self.pad_word: 1, self.unk_word: 1}
        data = tfds.load('imdb_reviews/plain_text',
                         split='unsupervised',
                         data_dir=DATA_DIR)
        total_words = 2  # pad and unknown
        entry_count = 2
        max_len = 0
        for text in tfds.as_numpy(data):
            tokens = list(tokenize(str(text), lowercase=True))[3:]
            for idx, word in enumerate(tokens):
                total_words += 1
                if not word in encoder.keys():
                    entry_count += 1
                    encoder[word] = entry_count
                    probs[word] = 1
                else:
                    probs[word] += 1
            if idx > max_len:
                max_len = idx
        print(f"The vocabulary size is {entry_count}")
        print(f"The maximum length of a review is {max_len}")
        probs = {k: v / total_words for k, v in probs.items()}
        probs[self.pad_word] = 1 - np.finfo(np.float32).eps
        probs[self.unk_word] = np.finfo(np.float32).eps

        glove2word2vec(self.glove_input_file, self.model_file)
        model = KeyedVectors.load_word2vec_format(self.model_file,
                                                  binary=False)

        print("Creating matrix")
        skipped_words = 0
        emb_matrix = np.zeros((entry_count, self.vec_len), dtype=np.float32)
        for i, word in enumerate(encoder.keys()):
            try:
                emb_matrix[i] = model[word]
            except KeyError:
                # word missing from the GloVe vocabulary
                skipped_words += 1

        print(f"Skipped {skipped_words} out of {entry_count}")
        np.save(open(self.matrix_file, 'wb'), emb_matrix)
        pickle.dump(encoder, open(self.encoder_file, 'wb'), protocol=0)
        pickle.dump(probs, open(self.probs_file, 'wb'), protocol=0)

        return encoder, probs, emb_matrix
    def _clean_text(self, the_tweet_text):
        cleaned_text = p.clean(the_tweet_text).lower().replace("’", "'")
        words = cleaned_text.split()
        reformed = [
            CONTRACTIONS[word] if word in CONTRACTIONS else word
            for word in words
        ]
        cleaned_text = " ".join(reformed)
        cleaned_text = cleaned_text.translate(
            str.maketrans('', '', string.punctuation))
        cleaned_text = self._removeNonAscii(cleaned_text)
        tokenized_text = list(tokenize(cleaned_text))
        tokenized_text = self._remove_stops(tokenized_text)

        return tokenized_text
Example #37
def process_post(args):
    """Normalize an entry into tokens"""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)

    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = [
            token.encode('utf8')
            for token in utils.tokenize(text, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')
        ]

    return result, subject, pageid
Example #38
def fasttext_model_train(data, from_scratch):
    # Preprocessing like stopword removal @TODO
    ge_sentences = [list(tokenize(s)) for s in data['text'].to_list()]
    if from_scratch:
        model = FastText(bucket=1000000, window=3, min_count=1, size=300)
        model.build_vocab(sentences=ge_sentences)
        model.train(sentences=ge_sentences,
                    total_examples=len(ge_sentences),
                    epochs=10)
    else:
        print("salam")
        model = FastText.load_fasttext_format('content/cc.en.300')
        model.build_vocab(ge_sentences, update=True)
        # model.train(sentences=ge_sentences, total_examples = len(sent), epochs=5)
    return model
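A hypothetical call, assuming gensim 3.x (where FastText still accepts `size=`) and a pandas DataFrame with a 'text' column:

import pandas as pd

df = pd.DataFrame({'text': ["first toy document", "second toy document about text"]})
model = fasttext_model_train(df, from_scratch=True)
print(model.wv['document'].shape)  # (300,)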
Example #39
    def __iter__(self):
        if self._corpus_fpath.endswith(".gz"):
            corpus = gzip.open(self._corpus_fpath, "rt", encoding="utf-8")
        else:
            corpus = codecs.open(self._corpus_fpath, "r", "utf-8")

        for line in corpus:
            yield list(
                tokenize(line,
                         lowercase=False,
                         deacc=False,
                         encoding='utf8',
                         errors='strict',
                         to_lower=False,
                         lower=False))
Example #40
File: uci.py Project: Farik/bigartm
 def get_texts(self):
     i = 0
     for fn in self.input:
         if i > 100:
             break
         i += 1
         text = open(fn, 'r').read()
         #yield [CorpusNTA.lmtzr.lemmatize(word) for word in list(utils.tokenize(text, deacc=True, lower=True)) if word not in CorpusNTA.stoplist]
         yield [
             word
             for word in list(utils.tokenize(text, deacc=True, lower=True))
             if word not in CorpusNTA.stoplist
         ]
         if i % 100 == 0:
             print("%d documents processed" % i)
Example #41
def iter_documents(top_directory):
    numFound = 0
    for root, dirs, files in os.walk(top_directory):
        for dir1 in filter(
                lambda newspaper: newspaper !=
                "TheCharlestonMercury-incomplete" and newspaper !=
                "VincennesCourant", dirs):
            #for dir1 in filter(lambda newspaper: newspaper == "TheCharlestonMercury-incomplete" or newspaper == "VincennesCourant" , dirs):
            #print(dir1)
            for root2, dirs2, files2 in os.walk(top_directory + "/" + dir1):
                #print(files2)
                for dir2 in dirs2:
                    for root3, dirs3, files3 in os.walk(top_directory + "/" +
                                                        dir1 + "/" + dir2):
                        for file1 in filter(
                                lambda filee: filee.endswith('.txt'), files3):
                            #print('hi')
                            document = open(
                                os.path.join(root, dir1, dir2, file1)).read()
                            newline = str(
                                numFound
                            ) + "," + dir1 + "/" + dir2 + "/" + file1 + ","
                            mdfile = open(
                                "accessible/" + dir1 + "/" + dir2 + "/" +
                                file1[:len(file1) - 3] + "md", "r")
                            lines = mdfile.readlines()
                            for line in lines:
                                newline += line.split(", ")[1].strip(
                                    "\n") + ","
                            aFile.write(newline[:len(newline) - 1] + "\n")
                            stoplist = set('for a of the and to in'.split())
                            resultwords = [
                                word for word in document.split()
                                if word.lower() not in stoplist
                            ]
                            result = ' '.join(resultwords)

                            cleanedwords = [
                                re.subn("[^a-zA-Z]+", ' ', word)[0]
                                for word in result if '-' not in word
                            ]
                            resultfinal = ''.join(cleanedwords)
                            words = [
                                word.strip() for word in resultfinal.split()
                            ]
                            final = ' '.join(words)
                            numFound += 1
                            yield utils.tokenize(resultfinal, lower=True)
Example #42
def create_bow_corpus(textFile, dictionary, outputDir):
    """"
    Creates a Gensim bag-of-words corpus from a Gensim dictionary and saves it
    
    textFile (string): Location of the specified text file used in dictionary generation
    dictionary (Gensim dictionary object): Dictionary object used in corpus generation
    outputDir (string): Location to save the corpus
    """
    outputFile = outputDir + 'DBLP_Corpus.mm'
    with open(textFile, encoding='utf-8') as (iFile):
        corpus = [
            dictionary.doc2bow((tokenize(line)), allow_update=True)
            for line in iFile
        ]
    corpora.MmCorpus.serialize(outputFile, corpus)
    print('Corpus created and stored at: ' + outputFile)
    return corpus
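A hypothetical call, assuming `tokenize` here is gensim.utils.tokenize and that the text file and output directory exist (both paths are placeholders):

from gensim import corpora
from gensim.utils import tokenize

dictionary = corpora.Dictionary(tokenize(line) for line in open('DBLP.txt', encoding='utf-8'))
corpus = create_bow_corpus('DBLP.txt', dictionary, 'output/')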
def get_text_content(root):
    path = './body/body.content/block[@class="full_text"]'
    full_text = root.find(path)
    if (full_text is None):
        return (None, 0)
    text = ''.join(full_text.itertext()).strip()
    # Ditch lead paragraph
    lines = text.split("\n")
    if lines[0].startswith('LEAD:'):
        lines = lines[1:-1]
    text = ' '.join(lines)
    # with open("corpora/scratch.txt", "a") as f:
    #     f.write(text)
    #     f.write("\n---\n")
    tokens = [t.lower() for t in tokenize(text)]
    wc = len(tokens)
    return (' '.join(tokens), wc)
Example #44
def body_topic(dataframe):
    text_body = dataframe['body'].values
    text_body = [remove_stopwords(body) for body in text_body]
    text_body = [
        tokenize(body, deacc=True, lowercase=True) for body in text_body
    ]
    text_body = [[snow.stem(token) for token in word_list]
                 for word_list in text_body]
    dataframe['tokens'] = [list(gen) for gen in text_body]
    dataframe['corpus'] = [
        dictionary.doc2bow(doc) for doc in dataframe['tokens']
    ]
    dataframe['predicted_topic'] = [
        probs_to_topic(topic_probs)
        for topic_probs in topic_model.get_document_topics(dataframe['corpus'])
    ]
    return dataframe
Example #45
def preprocess(content):
  """  params -: raw text scrapped from website
       return -: return list of words after:    
                1) tokenization
                2) remove stopwords and some insignificant words
                3) convert in lowercase 
                4) lemmatize 
                5) Remove common web terms """

  content = tokenize(content, deacc=True)
  content = list(filter(is_significant, content))
  content = [token.lower() for token in content]
  MIN_WORDS = 30  #minimum words needed to decide whether site is english or not
  if len(content) > MIN_WORDS and not is_english(content): return ['invalidcontentfound']   #signal for non_engish site 
  content = [lemmatize(token) for token in content if token not in STOPWORDS and token in dictionary]
  content = [token for token in content if token not in AVOID]
  return content
Example #46
def simple_preprocess(doc: str,
                      lower: bool = False,
                      deacc: bool = False,
                      min_len: int = 2,
                      max_len: int = 15) -> List[str]:
    r"""
	Gensim's simple_preprocess adding a 'lower' param to indicate wether or not to
	lower case all the token in the texts

	For more informations see: https://radimrehurek.com/gensim/utils.html
	"""
    tokens = [
        token
        for token in tokenize(doc, lower=False, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens
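A quick sanity check of the wrapper above, assuming the `lower` flag is forwarded to gensim's tokenize as in the body shown:

print(simple_preprocess("Hello, WORLD of Gensim!", lower=True))
# ['hello', 'world', 'of', 'gensim']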
Example #47
def gen_vocab(tweets):
    vocab, reverse_vocab = {}, {}
    vocab_index = 1
    for tweet in tweets:
        text = tokenize(tweet.lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[
                    vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
    return vocab
Example #48
def lemmatize(content):
    """
	Use the English lemmatizer from `pattern` to extract tokens in
	their base form=lemma, e.g. "are, is, being" -> "be" etc.
	This is a smarter version of stemming, taking word context into account.

	Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    """
    content = u' '.join(utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if utils.ALLOWED_TAGS.match(tag):
                    result.append(lemma.encode('utf8'))
    return result
Example #49
def word2vec_classifier(dataset):
    documents = []
    for line in dataset:
        # Wrapper method for tokenizing with
        tokens = tokenize(line[0], lower=True)
        sentence = LabeledSentence(tokens, line[1])
        documents.append(sentence)
    log.info("Doc2Vec %d lines" % (len(documents)))
    # Model parameters
    num_features = 100
    min_word_count = 1
    num_workers = 8
    context = 2
    downsampling = 1e-3
    d2v_model = Doc2Vec(min_count=min_word_count,
                        window=context,
                        size=num_features,
                        sample=downsampling,
                        workers=num_workers)
    log.info("Training doc vectors")
    train_set, test_set = train_test_split(documents,
                                           train_size=0.7,
                                           test_size=0.3)
    train_vec = getAvgFeatureVecs(train_set, d2v_model, num_features)
    test_vec = getAvgFeatureVecs(test_set, d2v_model, num_features)
    train_vec = Imputer().fit_transform(train_vec)
    test_vec = Imputer().fit_transform(test_vec)

    # train model and predict with LinearSVC
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vec, train_set[1])
    result = classifier_fitted.predict(test_vec)

    # output result to csv
    result.tofile("./d2v_linsvc.csv", sep='\t')

    # store the model to mmap-able files
    joblib.dump(model, 'model/%s.pkl' % 'd2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vec)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(class_list, classes=class_list)

    # generate_eval_metrics(binarise_result, 'w2v_linsvc', binarise_labels)
    generate_report(binarise_result, 'w2v_linsvc', binarise_labels)
def prepare_corpus(dirname, text_cutoff=1000000):
    underscore = re.compile(r'\_')
    authors, titles, texts = [], [], []
    for filename in sorted(glob.glob(dirname + "/*")):
        if '_' in filename:
            author, title = underscore.split(
                os.path.split(filename)[-1].replace(".txt", ""), maxsplit=1)
        else:
            author, title = next(DUMMY_AUTHORS), os.path.basename(
                filename).replace(".txt", "")
        authors.append(author)
        titles.append(title)
        with open(filename) as infile:
            texts.append(
                list(
                    islice(tokenize(infile.read(), lowercase=True, deacc=True),
                           0, text_cutoff)))
    return Dataset(texts, titles, authors)
def clean_text_by_word(text, deacc=True):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "",
                                                   [AB_ACRONYM_LETTERS])
    original_words = list(
        tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [
        join_words(word_list, "")
        for word_list in preprocess_documents(original_words)
    ]
    if HAS_PATTERN:
        tags = tagger.tag(
            original_words)  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
Example #52
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    """tokenize words in the corpus
    """
    if lower:
        lower_map = {
            ord(u'A'): u'a',
            ord(u'B'): u'b',
            ord(u'C'): u'c',
            ord(u'Ç'): u'ç',
            ord(u'D'): u'd',
            ord(u'E'): u'e',
            ord(u'F'): u'f',
            ord(u'G'): u'g',
            ord(u'Ğ'): u'ğ',
            ord(u'H'): u'h',
            ord(u'I'): u'ı',
            ord(u'İ'): u'i',
            ord(u'J'): u'j',
            ord(u'K'): u'k',
            ord(u'L'): u'l',
            ord(u'M'): u'm',
            ord(u'N'): u'n',
            ord(u'O'): u'o',
            ord(u'Ö'): u'ö',
            ord(u'P'): u'p',
            ord(u'R'): u'r',
            ord(u'S'): u's',
            ord(u'Ş'): u'ş',
            ord(u'T'): u't',
            ord(u'U'): u'u',
            ord(u'Ü'): u'ü',
            ord(u'V'): u'v',
            ord(u'Y'): u'y',
            ord(u'Z'): u'z'
        }
        content = content.translate(lower_map)

    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=False, errors='ignore')
        if token_min_len <= len(token) <= token_max_len
        and not token.startswith('_')
    ]
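A small illustration of the Turkish case folding above; the output is the expected result, not a verified run:

print(tokenize_tr(u"İstanbul VE Iğdır"))
# ['istanbul', 've', 'ığdır']  -- dotted İ -> i, dotless I -> ı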
Example #53
def get_papar_words(author_papers):
    paper_character = []
    for i, paper in enumerate(author_papers):
        title = [word.lower() for word in tokenize(paper['title'])]
        abstract = []
        keywords = []
        text = []
        #        if 'abstract' in paper.keys() and paper['abstract'] is not None:
        #            abstract=[word.lower() for word in tokenize(paper['abstract'])]
        if 'keywords' in paper.keys() and paper['keywords'] is not None:
            keywords = [word.lower() for word in paper['keywords']]

        text = title + abstract + keywords  # merge title, keywords and abstract
        text = [
            word for word in text if (word not in my_stopwords) and (
                word not in stopwords.words('english'))
        ]
        paper_character.append(text)
    return paper_character
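A hypothetical call, assuming `my_stopwords` is any extra stopword set, the NLTK stopword corpus is available, and `tokenize` is gensim.utils.tokenize:

my_stopwords = set()
papers = [{'title': 'Deep Learning for Text', 'keywords': ['NLP']}]
print(get_papar_words(papers))
# e.g. [['deep', 'learning', 'text', 'nlp']]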
Example #54
def preprocess_text(document):
    """
    Performs advanced preprocessing on a string and returns lemmatized list of tokens.

    :param document: Document string to be preprocessed
    :return: List of preprocessed tokens
    """

    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(CUSTOM_STOP_WORDS)
    en_stop = set(stop_words)
    tokens = utils.tokenize(document, lowercase=True, deacc=True)
    tokens = [
        str(token) for token in tokens
        if (token not in en_stop and not token.startswith('_'))
    ]
    tokens = [get_lemma2(token) for token in tokens]

    return tokens
Example #55
 def __iter__(self):
     for directory in CowReader.dirs:
         with codecs.open(os.path.join(CowReader.root, directory,
                                       directory + ".xml"),
                          encoding='utf-8') as infile:
             sentence = []
             for line in infile:
                 if line.startswith('<s'):
                     continue
                 elif line.startswith('</s>'):
                     yield sentence
                     sentence = []
                 else:
                     word, pos, lemma = line.strip().split('\t')
                     if pos not in ('$.', 'punc'):
                         sentence.append(word.lower())
     with codecs.open(CowReader.vvb, encoding='utf-8') as vvb:
         for sentence in vvb:
             yield list(tokenize(sentence, lowercase=True))
Example #56
    def prepare_text(self, plain_text):
        tokens = list(tokenize(plain_text))
        tokens = [x for x in tokens if x.lower() not in STOPWORDS]
        plain_text = " ".join(tokens)

        bigram_mdl = Phrases(tokens, min_count=1, threshold=2)
        custom_filters = [strip_punctuation, strip_numeric]
        tokens = preprocess_string(plain_text, custom_filters)
        tokens = [t for t in tokens if len(t) > 2]
        bigrams = bigram_mdl[tokens]
        words = list(bigrams)

        words = [re.sub('_', '-', word) for word in words]
        vecs = [
            self.word2vec[word] if word in self.word2vec.keys() else np.zeros(
                shape=(1, 20)) for word in words
        ]
        # return list of arrays, each array is  vector of a single word
        return vecs
Example #57
    def get_texts(self):
        stoplist = set('for a of the and to in'.split())  # add http?
        for fname in os.listdir(self.dirname):
            W = []
            print(os.path.join(self.dirname, fname))
            for line in io.open(os.path.join(self.dirname, fname),
                                'r',
                                encoding='windows-1252'):
                line = re.sub(' "source":(.[^,]+)",', '',
                              line)  # remove json.loads corrupters
                line = re.sub("(?<=[^a-z])(')(?=.)|(?<=.)(')(?=[^a-z])", '"',
                              line)
                w = json.loads(line)

                # tokenize and remove common words
                w = utils.tokenize(w['text'], lowercase=True)
                w = [word for word in w if word not in stoplist]

                W.extend(w)
            yield W
Example #58
def predict(text):
    x = np.zeros((1, timesteps, 300), dtype=np.float32)

    tokens = tokenize(text)

    mj = 0

    for w in tokens:
        if (mj < timesteps):

            try:
                x[0][mj] = w2v.word_vec(w)
                mj += 1
            except KeyError:
                # skip words missing from the word2vec vocabulary
                continue

        else:
            break

    return model.predict(x)
Example #59
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.summarization.textcleaner import clean_text_by_word
        >>> clean_text_by_word("God helps those who help themselves")
        {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
        'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
        'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "",
                                                   [AB_ACRONYM_LETTERS])
    original_words = list(
        tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [
        join_words(word_list, "")
        for word_list in preprocess_documents(original_words)
    ]
    if HAS_PATTERN:
        tags = tag(join_words(
            original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
Example #60
 def _extract_texts(self, content):
     # remove all \n
     # gensim's tokenizer and to lowercase
     # remove stop words
     # remove infrequent words
     try:
         title = content.get("title", "")
         comments = content.get("comments", [])
         comments = map(lambda x: x.replace("\n", " "), comments)
         raw_texts = "{} {}".format(title, "\n".join(comments))
         # tokenize
         # from nltk.tokenize import RegexpTokenizer
         # tokenizer = RegexpTokenizer('[a-zA-Z][a-zA-Z0-9]*')
         tokens = list(tokenize(raw_texts, lower=True, deacc=True))
         # stopwords
         clean_tokens = [t for t in tokens if t not in en_stopwords]
         # infrequent words
         texts = " ".join(clean_tokens)
         return texts
     except Exception as e:
         traceback.print_exc()