Example #1
def preprocess(text):
    #convert text to lower case
    text = text.lower()

    #remove leading/trailing whitespace
    text = text.strip()

    #removing digits
    text = gensim.parsing.preprocessing.strip_numeric(text)
    #text = ' '.join(s for s in text.split() if not any(c.isdigit() for c in s))

    #print(text)

    #remove stopwords
    text = gensim.parsing.preprocessing.remove_stopwords(text)

    #strip punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)

    #strip multiple whitespace that might occur after we remove stopwords
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)

    p = PorterStemmer()

    text = ' '.join(p.stem(word) for word in text.split())

    #print(text)

    return text
Example #2
def dataToXYListRead(fileName):
    with open(fileName) as file:
        porter_stemmer = PorterStemmer()
        lineCount = 0
        wordSentenceDbLi = []
        while True:
            line = file.readline()
            if not line:
                break
            # if lineCount == 20:
            #     break
            jsonLine = json.loads(line)

            # noStopWords = remove_stopwords(jsonLine['text'])
            # stemWords = porter_stemmer.stem(noStopWords)
            stemWords = porter_stemmer.stem_sentence(jsonLine['text'])  # stem each word, not the whole string
            tokenWords = simple_preprocess(stemWords, deacc=True)

            # print(tokenWords)
            wordSentenceDbLi.append(tokenWords)
            lineCount += 1
        # yelpDic = corpora.Dictionary(wordSentenceDbLi)
        # yelpDic.save('yelpDictionary.dict')
        # print(yelpDic.token2id)
        # print(yelpDic[8])

        return wordSentenceDbLi
Example #3
def preprocess(data, stem_data, remove_stopwords):
    processed = []
    stemmer = PorterStemmer()
    for file in data:

        # lowercasing all text

        file = str(file).lower()

        # removing non-alpha characters
        file = re.sub('[^a-zA-Z]', ' ', file)

        # tokenizing articles
        tokenized = word_tokenize(file)

        # removing stop words from tokens
        stop_removed_tokens = []
        if remove_stopwords:
            for word in tokenized:
                if word not in stop_words:
                    stop_removed_tokens.append(word)
        else:
            stop_removed_tokens = tokenized
        if stem_data:
            stemmed = []
            for token in stop_removed_tokens:
                stemmed.append(stemmer.stem(token))
            processed.append(stemmed)
        else:
            processed.append(stop_removed_tokens)
    return processed
Example #4
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #5
def token_stem(text):

    tokens = simple_preprocess(text, deacc=True)

    porter_stemmer = PorterStemmer()
    stem_tokens = [porter_stemmer.stem(word) for word in tokens]

    return stem_tokens
Example #6
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    return ' '.join(
        p.stem(word) for word in
        text.lower().split())  # lowercasing required by the stemmer
Example #7
def document_preprocess(text):
    p = PorterStemmer()
    first = text.encode('ascii', 'ignore').decode('utf-8').lower()
    second = preprocessing.remove_stopwords(first)
    third = preprocessing.strip_punctuation(second)
    fourth = preprocessing.strip_short(preprocessing.strip_numeric(third))
    fifth = p.stem_sentence(fourth)  # stem each word rather than the whole string
    return fifth
Example #8
 def find_documents(self, term, stemming=False):
     stemmer = PorterStemmer()
     if stemming:
         term = stemmer.stem(term)
     term_id = self.get_id_for_term(term)
     if term_id < 0:
         return set()
     docs = self.get_related_documents(term_id)
     return set(docs)
Example #9
 def cleanText(self, textToClean):
     textLower = str(textToClean).lower()
     englishText = "".join(
         [char for char in textLower if char in string.printable])
     textNoPunc = "".join(
         [char for char in englishText if char not in string.punctuation])
     textStop = remove_stopwords(textNoPunc)
     porter = PorterStemmer()
     textStemmed = porter.stem_sentence(textStop)  # stem word by word before splitting
     return (textStemmed.split())
Example #10
def spimi_invert(
    files: List[str],
    stemmer: PorterStemmer,
    blocks_dir: str,
    memory_available: int,
) -> List[str]:
    """SPIMI-Invert procedure.

    Collect terms, docIDs, term-frequencies into a block (dictionary
    of dictionaries) that fits in available memory, write each block's
    dictionary to disk, and start a new dictionary for the next block.

    Args:
        files: List of filepaths.
        stemmer: Gensim porter stemmer.
        blocks_dir: Directory where blocks are saved.
        memory_available: Available memory in bytes.

    Returns:
        List of filenames of saved blocks.

    """
    memory_used = 0
    outputed_blocks = []
    block_index = 0
    dictionary = {}
    for docId, token in token_stream(files):
        memory_used += sys.getsizeof(token)

        term = stemmer.stem(token)
        if term not in dictionary:
            dictionary[term] = {}
        if docId not in dictionary[term]:
            dictionary[term][docId] = 0
        dictionary[term][docId] += 1  # save term freq. in document

        if memory_used > memory_available:
            # Sort terms and write to disk
            with shelve.open(blocks_dir + "block" + str(block_index)) as f:
                for k in sorted(dictionary.keys()):
                    f[k] = dictionary[k]
            outputed_blocks.append("block" + str(block_index))
            block_index += 1
            memory_used = 0
            dictionary = {}

    # Save last block
    if dictionary:
        with shelve.open(blocks_dir + "block" + str(block_index)) as f:
            for k in sorted(dictionary.keys()):
                f[k] = dictionary[k]
        outputed_blocks.append("block" + str(block_index))
    return outputed_blocks
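A hedged usage sketch for the SPIMI routine above; the input file paths, the blocks/ directory, and the token_stream helper it relies on are assumptions taken from the snippet, not shown here.

# Minimal usage sketch (assumes token_stream() from the snippet above is
# importable and that the blocks/ directory already exists).
from gensim.parsing.porter import PorterStemmer

stemmer = PorterStemmer()
block_names = spimi_invert(
    files=["lyrics/a.txt", "lyrics/b.txt"],  # hypothetical file paths
    stemmer=stemmer,
    blocks_dir="blocks/",
    memory_available=1_000_000,  # flush a block roughly every ~1 MB of tokens
)
print(block_names)  # e.g. ['block0', 'block1']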
Example #11
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually', r'\usepackage{amsbsy',
        r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    resultlist = []
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                tokens.append(p.stem(item))  # keep the stemmed token, not the original
        yield model.infer_vector(tokens)
Example #12
def build_name_index(docs: List[str], stemmer: PorterStemmer) -> None:
    """Build index from list of song names.

    Args:
        docs: List of filenames.
        stemmer: Gensim porter stemmer.

    """
    index_names = defaultdict(dict)
    for docId, doc in enumerate(docs):
        for token in pretty_doc(doc).split():
            term = stemmer.stem(token)
            index_names[term][docId] = 1
    with shelve.open("index_names") as index:
        index.update(index_names)
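A short, hedged usage sketch; the song filenames and the pretty_doc helper used above are assumptions not defined in the snippet.

from gensim.parsing.porter import PorterStemmer

song_files = ["metallica-one.txt", "nirvana-lithium.txt"]  # hypothetical filenames
build_name_index(song_files, PorterStemmer())  # writes the "index_names" shelve file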
Example #13
def load_data(tweets_tsv, tweets_postag):
    """
    Return tweet id, user id, tweet label, raw tweet, tokenized tweet,
    PoS tags, PoS-tagged tweet and stemmed tweet in a pandas DataFrame.

    :param tweets_tsv: <SID><tab><UID><tab><CLASS><tab><TWITTER_MESSAGE>
    :param tweets_postag: ark-TweetNLP `./runTagger.sh --output-format conll --input-format txt --input-field 4`
    :rtype: pandas.DataFrame
    """

    o = open(tweets_tsv, 'r', encoding='utf-8').readlines()
    p = open(tweets_postag).read()

    raw = p.split('\n\n')
    raw_pos_data = [line.split('\n') for line in raw]
    pos_data = []
    for tweet in raw_pos_data:
        pos_data.append([tuple(word_pos.split('\t')) for word_pos in tweet])

    stemmer = PorterStemmer()

    data = {}
    for idx, line in enumerate(o):
        tweet_id, user_id, adr, text = line.split('\t')
        data[tweet_id] = {}
        data[tweet_id]['user_id'] = user_id
        data[tweet_id]['adr'] = adr
        data[tweet_id]['raw_text'] = text
        data[tweet_id]['stem_text'] = [
            stemmer.stem(w_pos[0]) for w_pos in pos_data[idx]
        ]
        data[tweet_id]['tok_text'] = [w_pos[0] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_token'] = [w_pos[1] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_text'] = [
            '#'.join(list(w_pos)) for w_pos in pos_data[idx]
        ]

    df = pd.DataFrame.from_dict(data, orient='index')

    df.adr = df.adr.astype('int')
    df.user_id = df.user_id.astype('int')

    logger.info("Loaded dataframe from {0} and {1}".format(
        tweets_tsv, tweets_postag))
    logger.info("Dataframe information:\n")
    df.info()

    return df
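Assuming input files in the two formats described by the docstring (both paths are hypothetical), the loader would be called like this:

df = load_data("adr_tweets.tsv", "adr_tweets.conll")  # hypothetical paths
print(df[["adr", "tok_text", "stem_text"]].head())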
Example #14
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "w", encoding="utf-8")  # text mode: plain strings are written below
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        preprocess_list2 = []
        for word in preprocess_list1:
            if word not in stop_words:
                preprocess_list2.append(word)
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word))
            fp2.write(' ')
        fp2.write('\n')
    fp1.close()
    fp2.close()
Example #15
def stem_text(text):
    """Transform `s` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #16
def stem_text(text):
    """Transform `s` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    #text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #17
    def __init__(self, path):
        """Load the downloaded corpus.

        Parameters
        ----------
        path : string
            Path to the extracted zip file. If 'summaries-gold' is in a folder
            called 'opinosis', then the Path parameter would be 'opinosis',
            either relative to your current working directory or absolute.
        """
        # citation
        path = os.path.join(path, "summaries-gold")
        dictionary = Dictionary()
        corpus = []
        stemmer = PorterStemmer()

        for directory, _, filenames in os.walk(path):
            # each subdirectory of path is one collection of reviews to a specific product
            # now get the corpus/documents
            for filename in filenames:
                filepath = directory + os.sep + filename
                # read the whole review document for this product
                with open(filepath) as file:
                    doc = file.read()

                preprocessed_doc = [
                    stemmer.stem(token)
                    for token in re.findall(r'\w+', doc.lower())
                    if token not in STOPWORDS
                ]

                dictionary.add_documents([preprocessed_doc])
                corpus += [dictionary.doc2bow(preprocessed_doc)]

        # and return the results the same way the other corpus generating functions do
        self.corpus = corpus
        self.id2word = dictionary
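If this __init__ belongs to a corpus wrapper class, say OpinosisCorpus (a naming assumption; only the constructor is shown), the resulting bag-of-words corpus and dictionary could feed a topic model:

# Sketch assuming the __init__ above sits in a class named OpinosisCorpus
# and that the Opinosis zip was extracted to ./opinosis/summaries-gold.
from gensim.models import LdaModel

corpus = OpinosisCorpus("opinosis")
lda = LdaModel(corpus=corpus.corpus, id2word=corpus.id2word, num_topics=10)
print(lda.show_topic(0))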
Example #18
from gensim.parsing.porter import PorterStemmer

sentence = [
    "This", "sentence", "was", "transformed", "using", "Porter", "Stemmer"
]
porterStemmer = PorterStemmer()

print(" ".join([porterStemmer.stem(word) for word in sentence]))
Example #19
    def clean(self, stopfile=None, startindex=0, stem=False):

        #need a filename
        if self.__raw_text is None:
            raise FileNotFoundError('No raw text file provided')

        print('Cleaning raw text data...', end='', flush=True)

        #rename class level cleaned text filename to one created here
        cleaned_text = '{}_cleaned.txt'.format(self.__raw_text[:-4])

        #initialize stopwords. default is nltk stopwords
        if stopfile is None:
            stops = set(stopwords.words('english'))
        elif os.path.isfile(stopfile):
            with codecs.open(stopfile, 'r', encoding='utf-8',
                             errors='ignore') as f:
                stops = set([word.strip().lower() for word in f.readlines()])
        else:
            raise Exception('Stopfile not found')

        #this regex will remove punctuation and other non-alphanumeric symbols
        regex = re.compile(r'[^a-zA-Z0-9\s]|[\_\^\`\[\]\\]', re.IGNORECASE)

        #clean the review file
        with codecs.open(self.__raw_text,
                         'r',
                         encoding='utf-8',
                         errors='ignore') as f:
            with open(cleaned_text, 'w') as cleaned:

                t1 = time.time()

                #go through every line in the file
                for line in f:

                    #remove non-alphanumeric symbols
                    line = regex.sub(' ', line)

                    #split into lowercased tokens and ignore stopwords
                    if stem:
                        stemmer = PorterStemmer()
                        tokens = [
                            stemmer.stem(word.lower().strip())
                            for word in line.split(' ')
                            if word.lower().strip() not in stops
                        ]
                    else:
                        tokens = [
                            word.lower().strip() for word in line.split(' ')
                            if word.lower().strip() not in stops
                        ]

                    #remove empty elements from the list
                    tokens = [word for word in tokens if word != '']

                    #ignore elements before start index
                    tokens = tokens[startindex:]

                    #write cleaned data to file
                    if len(tokens) > 0:
                        cleaned.write('{}\n'.format(' '.join(tokens)))

                t2 = time.time()

        #update the text file name
        self.__raw_text = cleaned_text

        print('done')

        #display time it took to do all of this
        print('Raw text cleaned in {} minutes.'.format(int((t2 - t1) / 60)))
Example #20
def postprocess_words(words):
    p = PorterStemmer()
    for i in range(len(words)):
        words[i] = p.stem(words[i])
    return words
Example #21
class nlp_engine:
    def __init__(self):
        # self.tokenizer   = BertTokenizer.from_pretrained("bert-base-uncased")
        # self.model       = BertModel.from_pretrained("bert-base-uncased")
        self.use_coref = False
        self.vectorizer = data.vectorize('fast')
        self.stemmer = PorterStemmer()

    def make_multiple_choice(self, word, sentence, ai=False):
        if (len(word.split(' ')) == 1):
            if (word in sentence.split(' ')):
                most = self.vectorizer.most_similar(word.lower(), topn=20)
                choices = [x[0].lower() for x in most]
                tmp = list()
                stems = list()

                tmp.append(word.lower())
                stems.append(self.stemmer.stem(word.lower()))
                for x in choices:
                    stem = self.stemmer.stem(x.replace('.', ''))
                    if (stem not in stems):
                        stems.append(stem)
                        tmp.append(x)
                return {
                    "type": 'mc',
                    "question": sentence.replace(word, '______'),
                    "answer": tmp[:4]
                }
            else:
                return None
        else:
            return None

    def fill_in_blank(self, word, sentence):
        if (len(word.split(' ')) == 1):
            if (word in sentence.split(' ')):
                return {
                    "type": 'fb',
                    "question": sentence.replace(word, '______'),
                    "answer": word
                }
            else:
                return None
        else:
            return None

    def __call__(self, context):
        context_doc = nlp(context)
        ents = context_doc.ents
        sentences_doc = [x.text for x in context_doc.sents]
        self.use_coref = USE_COREF and context_doc._.has_coref

        sentence_lengths = [len(sentences_doc[0])]
        for i in range(1, len(sentences_doc)):
            sentence_lengths.append(sentence_lengths[i - 1] +
                                    len(sentences_doc[i]))

        ner_spans = list()
        for ent in ents:  #use ner
            for i in range(len(sentence_lengths)):
                if (ent.start_char < sentence_lengths[i]):
                    ner_spans.append((ent.text, sentences_doc[i]))

        nn_spans = list()
        if (self.use_coref):
            for token in context_doc:
                if ((token.pos_ == 'PROPN' or token.pos_ == 'NOUN')
                        and token._.in_coref):
                    for cluster in token._.coref_clusters:
                        nn_spans.append((token.text, cluster.main.text))

        sa_pairs = dict()
        for a, s in ner_spans + nn_spans:
            if (a not in sa_pairs):
                sa_pairs[a] = [s]
            elif (s != sa_pairs[a]):
                sa_pairs[a].append(s)

        qa_pairs = list()
        mc_pairs = list()
        for w, sents in sa_pairs.items():
            for s in sents:
                o = self.fill_in_blank(w, s)
                if (o is not None):
                    qa_pairs.append(o)
                o = self.make_multiple_choice(w, s, False)
                if (o is not None):
                    qa_pairs.append(o)
                    mc_pairs.append(o)

        print(len(qa_pairs))
        print(mc_pairs)
        return qa_pairs
Example #22
class Indexer:
    """Class that implements querying and printing results.

    Attributes:
        root: Directory where the song lyrics are stored.
        docs: List of documents filenames, that is used to get docID.
        word_count: Length of each document.
        stemmer: Gensim porter stemmer.
        index: Index file descriptor.

    """
    def __init__(self,
                 docs: List[str],
                 index_path: str,
                 root: str = "lyrics/") -> None:
        """Initialize Indexer by assigning attributes and opening index file.

        Args:
            docs: List of documents filenames.
            index_path: Path to index file.
            root: Directory where the song lyrics are stored.

        """
        self.root = root
        self.docs = docs
        self.stemmer = PorterStemmer()
        self.get_word_count()
        self.index = shelve.open(index_path)

    def get_word_count(self) -> None:
        """Get length of each document."""
        self.word_count = []
        for doc in self.docs:
            with open(self.root + doc, "r") as f:
                self.word_count.append(sum(len(line.split()) for line in f))

    def tfidf(self, posting: Dict[int, int]) -> List[Posting]:
        """Calculate tf-idf for documents in posting list.

        Args:
            posting: Posting list with term frequencies of some term.

        Returns:
            List of (docID, tf-idf score), sorted by docID.

        """
        return [(k,
                 v / self.word_count[k] * log2(len(self.docs) / len(posting)))
                for k, v in sorted(posting.items())]

    def query_boolean(self, tokens: List[str]) -> List[Posting]:
        """Recursively parse boolean query in DNF.

        Args:
            tokens: List of tokens.

        Returns:
            List of (docID, tf-idf score) of query hits.

        """
        try:
            split_idx = tokens.index("OR")
            return or_postings(
                self.query_boolean(tokens[:split_idx]),
                self.query_boolean(tokens[split_idx + 1:]),
            )
        except ValueError:
            pass
        try:
            split_idx = tokens.index("AND")
            return and_postings(
                self.query_boolean(tokens[:split_idx]),
                self.query_boolean(tokens[split_idx + 1:]),
            )
        except ValueError:
            pass
        try:
            split_idx = tokens.index("NOT")
            return not_postings(self.query_boolean(tokens[split_idx + 1:]),
                                len(self.docs))
        except ValueError:
            pass
        term = self.stemmer.stem(tokens[0])
        try:
            posting = self.tfidf(self.index[term])
        except KeyError:
            return []
        return posting

    def render_file(self,
                    tokens: List[str],
                    filename: str,
                    offset: int = 20) -> None:
        """Print song name and text snippet.

        Args:
            tokens: List of query tokens.
            filename: Song filename.
            offset: How much to extend text snippet in symbols.

        """
        # Print band and song name
        print("\033[4m{}\033[0m:".format(pretty_doc(filename)))
        # Try to find term in song text
        with open(self.root + filename) as f:
            text = "".join(f.readlines())
            lowered_text = text.lower()
            for token in tokens:
                try:
                    w = self.stemmer.stem(token)
                    w_match = re.search(r"\b{}\w*\b".format(w), lowered_text)
                    l_match = re.search(r"\b{}.*?\n".format(w), lowered_text)
                    if w_match.start() > offset:
                        print("...", end="")
                    start = max(0, w_match.start() - offset)
                    print("{}\033[1m{}\033[0m{}".format(
                        text[start:w_match.start()],
                        text[w_match.start():w_match.end()],
                        text[w_match.end():l_match.end() - 1],
                    ))
                except AttributeError:
                    print("-")

    def render(self, tokens: List[str], hits: List[Posting],
               count: int) -> None:
        """Print the results of query.

        Args:
            tokens: List of query tokens.
            hits: Query results as a list of (docID, tf-idf score).
            count: How many hits to print.

        """
        if not hits:
            print("Nothing found")
            return
        tokens = [t for t in tokens if t not in ["AND", "OR", "NOT"]]
        print("{} hits found.\n".format(len(hits)))
        for docId, v in hits[:count]:
            print("[relevance = {:.3f}]".format(v))
            self.render_file(tokens, self.docs[docId])
            print()

    def query(self, query: str, count: int = 10) -> None:
        """Query index and print results, sorted by tf-idf.

        Args:
            query: Query string.
            count: How many hits to print.

        """
        tokens = query.split()
        hits = self.query_boolean(tokens)
        hits = sorted(hits, key=lambda item: item[1], reverse=True)
        self.render(tokens, hits, count)

    def close(self) -> None:
        """Close index file."""
        self.index.close()
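A hedged end-to-end sketch for the Indexer class; the document list, the lyrics/ directory, the or/and/not_postings helpers it calls, and an index file built elsewhere (e.g. from merged SPIMI blocks) are assumptions.

song_files = ["metallica-one.txt", "nirvana-lithium.txt"]  # hypothetical filenames
indexer = Indexer(song_files, index_path="index", root="lyrics/")
indexer.query("love AND NOT hate", count=5)  # prints hits sorted by tf-idf
indexer.close()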
Example #23
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`. 
    """
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.lower().split()) # lowercasing required by the stemmer
Example #24
 def stem_text(self, text):
     text = utils.to_unicode(text)
     p = PorterStemmer()
     return ' '.join(p.stem(word) for word in text.split())
Example #25
	def normalize(tokens):
		stemmer = PorterStemmer()
		return [stemmer.stem(word) for word in tokens]