Example #1
def import_data(a_file, a_row, b_file, b_row):
    a_content = []
    a_content_1 = open(a_file, 'r')
    csv_reader_a = csv.reader(a_content_1)
    for row in csv_reader_a:
        row_new = remove_stopwords(row[a_row])
        row_new = strip_numeric(row_new)
        row_new = strip_non_alphanum(row_new)
        row_new = strip_short(row_new, minsize=3)
        a_content.append(row_new)
    a_length = len(a_content)
    a_label = np.ones(a_length)
    a_label = a_label.tolist()

    b_content = []
    b_content_1 = open(b_file, 'r')
    csv_reader_b = csv.reader(b_content_1)
    for row in csv_reader_b:
        row_new = remove_stopwords(row[b_row])
        row_new = strip_numeric(row_new)
        row_new = strip_non_alphanum(row_new)
        row_new = strip_short(row_new, minsize=3)
        b_content.append(row_new)
    b_length = len(b_content)
    b_label = np.zeros(b_length)
    b_label = b_label.tolist()

    return a_content, a_label, b_content, b_label
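This snippet assumes csv, NumPy and gensim's preprocessing helpers are already imported; a minimal setup sketch (the file names and column index in the commented call are hypothetical):

import csv
import numpy as np
from gensim.parsing.preprocessing import (remove_stopwords, strip_numeric,
                                          strip_non_alphanum, strip_short)

# Hypothetical call: column 0 of each CSV holds the raw text.
# a_texts, a_labels, b_texts, b_labels = import_data('human.csv', 0, 'machine.csv', 0)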
Example #2
def import_data(file):
    human = []
    machine = []
    content = open(file, 'r')
    csv_reader = csv.reader(content)
    for row in csv_reader:
        row1 = unicode(row[2], errors='ignore')  # Python 2 only; on Python 3 the value is already str
        row_new1 = remove_stopwords(row1)
        row_new1 = strip_numeric(row_new1)
        #row_new = strip_non_alphanum(row_new)
        row_new1 = strip_short(row_new1, minsize=3)
        human.append(row_new1)
        row2 = unicode(row[3], errors='ignore')
        row_new2 = remove_stopwords(row2)
        row_new2 = strip_numeric(row_new2)
        #row_new = strip_non_alphanum(row_new)
        row_new2 = strip_short(row_new2, minsize=3)
        machine.append(row_new2)

    length = len(human)
    human_label = np.ones(length)
    human_label = human_label.tolist()
    machine_label = np.zeros(length)
    machine_label = machine_label.tolist()

    return human, human_label, machine, machine_label
Example #3
def clean_compute_similarity(d1, d2):

    #print(type(d1))
    #print(type(d2))

    d1 = remove_stopwords(d1).split()
    d2 = remove_stopwords(d2).split()

    #print(d1)
    #print(d2)

    # Dictionary and Corpus
    documents = [d1, d2]
    dictionary = corpora.Dictionary(documents)

    # Composing the similarity matrix
    similarity_matrix = fasttext_model300.similarity_matrix(dictionary,
                                                            tfidf=None,
                                                            threshold=0.0,
                                                            exponent=2.0,
                                                            nonzero_limit=100)

    # Conversion of sentences into bag-of-words vectors: doc2bow() counts occurrences of
    # each distinct word, converts it to its integer word id, and returns a sparse vector.
    d1 = dictionary.doc2bow(d1)
    d2 = dictionary.doc2bow(d2)

    #print(d1)
    #print(d2)

    # Soft cosine similarity - Considers similarities between pairs of features
    score = softcossim(d1, d2, similarity_matrix)

    return score
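A sketch of the setup this function assumes. It targets the gensim 3.x API: KeyedVectors.similarity_matrix and gensim.matutils.softcossim were removed in gensim 4 (their successors are SparseTermSimilarityMatrix and SoftCosineSimilarity), and the model name below is just one example from gensim-data:

import gensim.downloader as api
from gensim import corpora
from gensim.matutils import softcossim              # gensim 3.x only
from gensim.parsing.preprocessing import remove_stopwords

fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
# score = clean_compute_similarity('the cat sat on the mat', 'a dog slept on the rug')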
Example #4
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques: 
    removes stopwords, strips short words (1-2 characters), strips numbers, 
    strips http addresses, strips Unicode from emoji etc., lowercases everything, 
    strips extra spaces, punctuation, non-alphanumeric symbols. Also performs stemming.

    input: 
        text: a string
    returns: 
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text) # remove stop words
    text = preprocess.strip_short(text) #get rid of short words
    text = preprocess.strip_numeric(text) #get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('',text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('',text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)

    return text
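`preprocess` and `stemmer` are not defined in the snippet; a plausible setup, assuming `preprocess` aliases gensim's preprocessing module and `stemmer` is an NLTK Porter stemmer:

import re
from gensim.parsing import preprocessing as preprocess
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# preprocessing("Check https://example.com for 2 GREAT tips!!")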
Example #5
def getTopics(question, answer):
    table = str.maketrans(dict.fromkeys(
        string.punctuation))  # OR {key: None for key in string.punctuation}
    new_q = question.translate(table)
    new_a = answer.translate(table)
    # print("q", new_q)
    # print("a", new_a)
    # print("astop", remove_stopwords(new_a))
    questionClean = remove_stopwords(new_q).lower().split()
    answerClean = remove_stopwords(new_a).lower().split()
    allWords = questionClean + answerClean
    # print("allWords", allWords)
    maxScore = 0
    maxTopic = ""
    maxWord = ""
    maxTopicScores = {topic: (0, "") for topic in allTopics}
    for topic in allTopics:
        for word in allWords:
            try:
                curScore = wv.similarity(word, topic)
            except KeyError:
                curScore = 0
            if curScore > maxScore:
                maxScore = curScore
                maxTopic = topic
                maxWord = word
            if maxTopicScores[topic][0] < curScore:
                maxTopicScores[topic] = (curScore, word)
    print(maxTopicScores)
    if maxScore > 0.15:
        print("maxscore", maxScore, maxWord)
        return [maxTopic]
    print("maxscore", maxScore, maxWord)
    return []
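`wv` and `allTopics` come from the surrounding module; a hypothetical setup where `wv` is any gensim KeyedVectors (the model name is only an example) and `allTopics` is a plain list of topic words:

import string
import gensim.downloader as api
from gensim.parsing.preprocessing import remove_stopwords

wv = api.load('glove-wiki-gigaword-100')
allTopics = ['sports', 'politics', 'music']
# getTopics("Who won the game last night?", "The home team scored a late goal.")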
Example #6
	def match(entity='', subject=''):
		
		tokenizer = RegexpTokenizer(r'\w+') 
		lemmatizer = WordNetLemmatizer()
		entity = [lemmatizer.lemmatize(e) for e in tokenizer.tokenize(remove_stopwords(entity).lower()) if all([not e.isnumeric(), not e[0].isnumeric(), len(e)>2])]
		subject = [lemmatizer.lemmatize(e) for e in tokenizer.tokenize(remove_stopwords(subject).lower()) if all([not e.isnumeric(), not e[0].isnumeric(), len(e)>2])]

		return any(e in subject for e in entity)
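Assumed imports for this snippet, with a hypothetical call:

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer    # requires the NLTK 'wordnet' data
from gensim.parsing.preprocessing import remove_stopwords

# match(entity='machine learning models', subject='learning about machines')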
Example #7
def readCorpus(fname, tokens_only=False, mode='w'):
    tokens = []
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if mode == 's':
                tokens.append(split_sentences(remove_stopwords(line)))
            else:  # Train text with or without tags
                tokens.append(gensim.utils.simple_preprocess(remove_stopwords(line)))
    return tokens
Example #8
    def test_strip_stopwords(self):
        self.assertEqual(remove_stopwords("the world is square"),
                         "world square")

        # confirm that redefining the global `STOPWORDS` works
        with mock.patch('gensim.parsing.preprocessing.STOPWORDS',
                        frozenset(["the"])):
            self.assertEqual(remove_stopwords("the world is square"),
                             "world is square")
Example #9
def cosine_distance(a,b):
    a = remove_stopwords(a)
    b = remove_stopwords(b)
    a_avg = None
    b_avg = None
    for w in a.split(" "):
        a_avg = a_avg + model[w] if a_avg is not None else model[w]  # avoid ambiguous NumPy truth test
    for w in b.split(" "):
        b_avg = b_avg + model[w] if b_avg is not None else model[w]
    a_avg /= len(a.split(" "))
    b_avg /= len(b.split(" "))
    return (1 - spatial.distance.cosine(a_avg, b_avg))
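A sketch of the setup this helper assumes: `model` can be any gensim KeyedVectors (the download name is only an example), and every word must be in the vocabulary or a KeyError is raised:

import gensim.downloader as api
from scipy import spatial
from gensim.parsing.preprocessing import remove_stopwords

model = api.load('glove-wiki-gigaword-100')
# cosine_distance('the quick brown fox', 'a fast auburn fox')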
Example #10
def string_processor(token):
    #     text = str(token)
    text = unidecode(token)
    text = remove_stopwords(text)
    text = strip_punctuation(text)
    text = remove_stopwords(text)
    #    text = strip_non_alphanum(text)  # would remove all punctuation
    tokens = sp(text)
    tokens = [token.lemma_ for token in tokens]  # lemma_ maps every 'I' to '-PRON-' (source code bug)
    #    tokens = [porter_stemmer.stem(token) for token in tokens]
    text = " ".join(tokens)
    text = strip_multiple_whitespaces(text)
    text = text.strip(' ')
    return text
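Assumed setup: `sp` looks like a spaCy pipeline and `unidecode` comes from the unidecode package (the spaCy model name is an assumption):

import spacy
from unidecode import unidecode
from gensim.parsing.preprocessing import (remove_stopwords, strip_punctuation,
                                          strip_multiple_whitespaces)

sp = spacy.load('en_core_web_sm')
# string_processor('Thé  quick, brown fox!')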
Example #11
    def cohesiveness_between_chapters(self, document):
        '''
        Compute cohesiveness between chapters using latent semantic analysis
        :param document: document to be processed, a list of chapters.
        :return: cohesiveness matrix
        '''
        document = [' '.join(chapter) for chapter in document]

        document = [remove_stopwords(chapter).split() for chapter in document]
        dictionary = corpora.Dictionary(document)

        corpus = [dictionary.doc2bow(chapter) for chapter in document]

        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]

        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
        corpus_lsi = lsi[corpus_tfidf]

        index = similarities.MatrixSimilarity(corpus_lsi)
        sims = index[corpus_lsi]
        # index = similarities.MatrixSimilarity(corpus_tfidf)  # similarity with tf-idf
        # sims = index[corpus_tfidf]
        #print(index)
        #print(sims)
        return sims
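Imports this method relies on (a sketch):

from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import remove_stopwords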
Example #12
def scrub_stopwords(txt: str, lib_sw: str = None) -> str:
    """
    Removes stopwords from text using a choice of libraries.
    Reference:
        - https://medium.com/towards-artificial-intelligence/stop-the-stopwords-using-different-python-libraries-ffa6df941653 # noqa: E501

    :param txt: String to pass in to remove stopwords.
    :param lib_sw: String of the library to use to remove stopwords.
    :return: String that has had its stopwords removed.
    """
    if lib_sw is None:
        return txt  # no library selected: return the text unchanged
    elif lib_sw == 'sklearn':
        txt = [word for word in txt.split() if word not in ENGLISH_STOP_WORDS]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'nltk':
        txt = [word for word in txt.split() if word not in STOPWORDS_NLTK]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'spacy':
        txt = [word for word in txt.split() if word not in STOPWORDS_SPACY]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'gensim':
        txt = remove_stopwords(txt)
        return txt
    else:
        raise Exception(
            f"Sorry, entered library, {lib_sw}, is not recognised.\n" +
            "Please enter one from [None, 'sklearn', 'nltk', 'spacy', 'gensim']"
        )
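A sketch of the stopword sets this function expects, with a usage example; the NLTK and spaCy variable names mirror the ones used above and are assumptions:

from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS as STOPWORDS_SPACY

STOPWORDS_NLTK = set(stopwords.words('english'))   # requires nltk.download('stopwords')

# scrub_stopwords("the world is square", lib_sw='gensim')   # -> "world square"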
Example #13
def clean_sentence(sentence, stopwords=False):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    if stopwords:
        sentence = remove_stopwords(sentence)

    return sentence
Example #14
 def send(self, s):
    tweet = json.loads(str(s))
    if 'user' not in tweet: return
    if tweet['lang'] != 'en': return
    txt = re.sub(r"[^\w\s@#]+", '', tweet['text']).lower()
    txt = str(remove_stopwords(' '.join(sorted(txt.split()))))
    self.meth(txt)
Example #15
    def process_review_raw_data(self):

        print("Review data pre-processing start...")

        _reviews = []

        with open(config.path2datasets + self.dataset_name, 'r') as f:
            for line in f.readlines():
                review_json = json.loads(line)

                _business_id = review_json['business_id']
                _review_id = review_json['review_id']
                _stars = review_json['stars']
                _text = review_json['text']

                # remove punctuation
                _text = strip_punctuation(_text)
                _text = remove_stopwords(_text)
                _text = _text.lower()

                _reviews.append({
                    'review_id': _review_id,
                    'business_id': _business_id,
                    'stars': _stars,
                    'text': _text
                })

        _reviews = pd.DataFrame(_reviews)
        _reviews.to_csv(config.path2data + self.dataset_name + "." +
                        config.path2reviews)
        _reviews = None

        print("Review data pre-processing DONE")
Example #16
def wordcloud_auto(df):
    """
    Takes a df, and turns it into a word cloud, if possible. 
    The data frame must have a column named 'text' in order for this function to
    run properly.
    """

    if 'text' in df.columns:
        df['gs_remove'] = df.text.apply(lambda x: remove_stopwords(x))
        df['nlp'] = df.gs_remove.apply(lambda x: nlp(x))
        lemma = []

        for i in iter(df.nlp):
            for j in i:
                lemma.append(j.lemma_)

        STOPWORDS.add('PRON')
        stopwords = STOPWORDS

        wordcloud = WordCloud(stopwords=stopwords,
                              background_color='White',
                              width=1000,
                              height=500,
                              max_words=30).generate(' '.join(lemma))

        plt.figure(figsize=(24, 16))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    else:
        print(
            "Cannot locate the text column. Please use pd.dataframe.rename() to specify the column."
        )
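Assumed setup for this function (names inferred from usage; the spaCy model is an assumption):

import spacy
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords

nlp = spacy.load('en_core_web_sm')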
Example #17
    def _proc_sent(self,
                   sent,
                   rm_dialog,
                   rm_stop,
                   stem,
                   rm_short=None,
                   min_nw_sent=3):
        sent = sent.lower()
        sent = re.sub(r'\s+', ' ', sent).strip()  # remove extra spaces

        if not sent:
            return None

        if rm_short and len(nltk.tokenize.word_tokenize(sent)) < min_nw_sent:
            return None

        if rm_dialog:
            dialog_tokens = ["''", "``"]
            for tk in dialog_tokens:
                if tk in sent:
                    logger.info('Remove dialog')
                    return None

            if config.test_year == '2005' and sent[0] == "'" and (
                    'says' in sent or 'said' in sent):
                logger.info('Remove dialog')
                return None

        if rm_stop:
            sent = remove_stopwords(sent)

        if stem:
            sent = self.porter_stemmer.stem_sentence(sent)

        return sent
Example #18
def index():
	index_id = request.json

	if not index_id:
		abort(400)
	
	posted_fields = index_id.keys()
	required_fields = {'id'}

	if not required_fields <= posted_fields:
		abort(400, f'Missing fields: {required_fields - posted_fields}')

	idx = index_id.get('id')
	timelines_req = requests.get('http://localhost:5100/timelines/id/'+idx)
	data = timelines_req.json()['data'][0]['text']
	lower = data.lower()
	for c in string.punctuation:
		lower = lower.replace(c, "")
	remove_sw = remove_stopwords(lower)
	tokens = remove_sw.split()

	for t in tokens:
		r.sadd(t, idx)

	response.status = 200

	return timelines_req.json()
Example #19
def data_clean(path_to_data, path_to_label):
    '''
    Inputs:
    path_to_data: path to data.txt
    path_to_label: path to label.txt
    Outputs:
    A pandas dataframe with the preprocessed data with the respective category labels
    '''
    data = []
    df = pd.read_excel(path_to_label)
    with open(path_to_data) as file:
        for line in file:
            line = regex(line)
            # to remove stopwords
            line = remove_stopwords(line)
            data.append(line.strip().lower())
            if data[-1] == "------------------------------------------------" \
                           "------------------------------------------------------":
                del data[-1]
    string = ""
    privacy_preprocessed = []
    for item in data[1:]:
        if item != data[0]:
            string += item
        else:
            privacy_preprocessed.append(string)
            string = ""
    privacy_preprocessed.append(string)

    df['Privacy_Policies'] = privacy_preprocessed
    df["len"] = df["Privacy_Policies"].apply(lambda x: len(x))
    df.drop(df[df["len"] == 0].index, inplace=True)

    return df
Example #20
def clean_documents(documents):
    documents_clean = []

    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    for d in documents:
        # Lowercase the document
        d = d.lower()
        # Remove URLs, emoji, smileys and numbers
        document_test = p.clean(d)

        # Remove stop words
        document_test = remove_stopwords(document_test)

        # Remove non-ASCII characters
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', document_test)
        # Remove mentions
        document_test = re.sub(r'@\w+', '', document_test)
        # Remove punctuation
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ',
                               document_test)
        # Remove digits
        document_test = re.sub(r'[0-9]', '', document_test)

        documents_clean.append(document_test)
    return documents_clean
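`p` here looks like the tweet-preprocessor package; a sketch of the assumed imports:

import re
import string
import preprocessor as p          # pip install tweet-preprocessor
from gensim.parsing.preprocessing import remove_stopwords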
Example #21
# Custom preprocess function for the test documents; can also be applied to a pandas DataFrame series.
def custom_preprocess(sentence):
  sentence = sentence.lower()
  no_stopwords = remove_stopwords(sentence)
  tokens = tokenize(no_stopwords)
  no_punctuation = strip_punctuation(no_stopwords)
  unwanted = remove_unwanted(no_punctuation)
  return unwanted
Example #22
    def train_word2vec(self):
        if self.train_documents is None:
            self.prepare_train_documents()

        print("\t. Estimating Word2Vec model")            
        
        print("\t. Loading training documents")

        counter = 0
        all_docs = []
        for train_doc in self.train_documents:

            doc = train_doc[:150000] if len(train_doc) > 150000 else train_doc
            if (counter%100) == 0:
                print("{0} .. len: {1}".format(counter,len(doc)))

            counter += 1
            doc = remove_stopwords(doc)
#            doc = re.sub(r'[^\w\s]','',doc)
            doc_tokens = nltk.word_tokenize(doc.lower())
            all_docs.append(doc_tokens)            

        print("Creating all tagged documents")
        documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_docs)]    

        print("\t. Run model")
        model = Doc2Vec(documents = documents,
                        vector_size=700,
                        window=7,
                        min_count=3)
        print()
        print("\t. Done")
        self.word2vec_model = model
Example #23
def clean_text(text):
    """ Cleans the text in the only argument in various steps 
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''

    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possible be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    #text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    #text = preprocessing.stem_text(text)
    return text
Example #24
def getLemmatizedText(name, content, language):
  language = language[:2]
  language = language.lower()
  outText = ""
  if language:
    if language == "is":
      outText = getLemmatizedTextIS(name, content)
      print("IS")
    else:
      outText = lemmatizerMultilanguage.getLemmatizedText(language, name+" "+content)
      print(language.upper())
  else:
    text = name+" "+content
    outText = text.lower().replace('.', '.')  # replacing '.' with '.' has no effect
    print("ERROR: No language for Lemmatizing text")
  cleaned = re.sub(' +', ' ',outText)
  cleaned = cleaned.replace('\n', '')
  cleaned = cleaned.replace('\r', '')

  cleaned = remove_stopwords(cleaned)
  cleaned = strip_tags(cleaned)
  cleaned = strip_punctuation(cleaned)
  cleaned = strip_numeric(cleaned)
  cleaned = strip_short(cleaned, 1)
  cleaned = strip_multiple_whitespaces(cleaned)
  cleaned = cleaned.lower()

  print("Lemmatized CLEAN: "+cleaned)
  return cleaned
Example #25
    def __init__(self, df, gram=1, n_most=20):
        self.df = df
        self.ngrams = []
        self.index = []
        self.value = []

        for sentence in self.df.text:
            sentence = remove_stopwords(sentence)
            splitted = sentence.split(' ')

            # filter out empty strings; removing items while iterating would skip elements
            splitted = [element for element in splitted if element != '']

            while len(splitted) > (gram - 1):
                self.ngrams.append(tuple(splitted[0:gram]))
                splitted.pop(0)

        self.count = Counter(self.ngrams).most_common(n_most)

        for i in self.count:
            if len(self.count[0][0]) == 2:
                self.index.append('\n'.join([i[0][0], i[0][1]]))
                self.value.append(i[1])
            elif len(self.count[0][0]) == 3:
                self.index.append('\n'.join([i[0][0], i[0][1], i[0][2]]))
                self.value.append(i[1])
            else:
                print('Neither 2 nor 3')
                break
Example #26
 def preprocess(rdd):
    """
    Pre-process tweets in rdd so they'll be
    suitable for use in the downstream topology
    """
    return rdd.map(
       # xform json into dicts
       lambda js: json.loads(js[1])
    ).filter(
       # analyze only tweets from users (skip "delete" messages, eg)
       lambda tweet: 'user' in tweet
    ).filter(
       # don't analyze our own tweets
       lambda tweet: tweet['user']['id_str'] != me
    ).filter(
       # english only
       lambda tweet: 'lang' in tweet and tweet['lang'] == 'en'
    ).map(
       # pluck out tweet's author & text & downcase tweet text
       lambda tweet: (tweet['user']['screen_name'], tweet['text'].lower())
    ).map(
       # kill punctuation, except for @mentions and #hashtags and spaces
       lambda t: (t[0], re.sub(r"[^\w\s@#]+", '', t[1]))
    ).map(
       # add text w/ stop words removed
       lambda t: (t[0], t[1], remove_stopwords(t[1]))
    ).map(
       # pprint() can only handle ascii, it seems
       lambda t: [ _.encode('ascii','ignore') for _ in t ]
    )
Example #27
def clean(doc):
    for i in range(doc.shape[0]):
        #lowercasing
        doc.set_value(i, doc.iloc[i].lower())
    #print("LOWERCASE")
    #print(doc)
    for i in range(doc.shape[0]):
        #remove punctuation
        doc.set_value(i, re.sub(r'([^\s\w])+', '', doc.iloc[i]))
    #print("REMOVE PUNCT")
    #print(doc)
    for i in range(doc.shape[0]):
        #remove stopwords
        doc.set_value(i, remove_stopwords(doc.iloc[i]))
    #print("REMOVE STOPWORDS")
    #print(doc)
    for i in range(doc.shape[0]):
        #tokenize
        doc.set_value(i, word_tokenize(doc.iloc[i]))
    #print("TOKENIZE")
    #print(doc)
    for i in range(doc.shape[0]):
        #lemmatize
        for j in range(len(doc.iloc[i])):
            doc.iloc[i][j] = lemmatizer.lemmatize(doc.iloc[i][j])
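Note: `Series.set_value` was deprecated in pandas 0.21 and removed in 1.0; on current pandas, label-based assignment such as `doc.at[i] = value` is the closest replacement.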
Example #28
def preprocess_tweet(tweet):
    """
    This function will preprocess the input tweet

    Steps for preprocessing:
        1. Lowercase the letters
        2. Replace the characters with frequency greater than 3 with 3 in a word
        3. Replace a url with Tag: <URLURL>
        4. Replace a tag mention: <UsernameMention>


    @TODO:
        1. Look for better preprocessing methods on the web
        2. Apply here
    """
    clean_tweet = tp.clean(tweet)

    # perform lemmatization
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)

    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet
Example #29
def write_discharge_summaries(out_file):
    notes_file = '%s/NOTEEVENTS.csv' % (MIMIC_3_DIR)
    print("processing notes file")
    with open(notes_file, 'r') as csvfile:
        with open(out_file, 'w') as outfile:
            print("writing to %s" % (out_file))
            outfile.write(
                ','.join(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']) +
                '\n')
            notereader = csv.reader(csvfile)
            #header
            next(notereader)
            i = 0
            for line in tqdm(notereader):
                subj = int(line[1])
                category = line[6]
                if category == "Discharge summary":
                    note = line[10]
                    #tokenize, lowercase and remove numerics
                    #tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
                    #text = '"' + ' '.join(tokens) + '"'
                    text = remove_stopwords(udf_clean(note))
                    outfile.write(','.join([line[1], line[2], line[4], text]) +
                                  '\n')
                i += 1
    return out_file
Example #30
def toSentences(pageList, language='English', keywords=None):
    # convert into long string (from list of page texts)
    longString = ''.join(pageList).replace('\n', ' ')

    # Remove Stop Words
    sentences_nostops = remove_stopwords(longString)

    # split into list of sentences
    sentences = nltk.sent_tokenize(sentences_nostops)

    if keywords:
        sentences = extractKeywordSentences(sentences, keywords)

    # Convert sentences to list of words
    data_words = list(sent_to_words(sentences))

    # Form Bigrams
    #sentences_bigrams = make_bigrams(data_words)
    sentences_bigrams = data_words

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])
    sentences_lemmatized = lemmatization(
        nlp, sentences_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return sentences_lemmatized
Example #31
def load_top_topics(docs=[], cnt_topics=5):

	docs = [remove_stopwords(doc) for doc in docs] #remove stopwords
	tokenizer = RegexpTokenizer(r'\w+') #=> https://www.kite.com/python/docs/nltk.RegexpTokenizer
	for i in range(len(docs)):
		docs[i] = docs[i].lower() #lower strings
		docs[i] = tokenizer.tokenize(docs[i]) #split strings into tokens
	docs = [[token for token in doc if not token.isnumeric() and not token[0].isnumeric()] for doc in docs] #exclude numbers
	docs = [[token for token in doc if len(token) > 1] for doc in docs] #exclude too short tokens
	lemmatizer = WordNetLemmatizer() #=> https://www.nltk.org/_modules/nltk/stem/wordnet.html
	docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs] #group similar words

	dictionary = Dictionary(docs) #create dictionary
	corpus = [dictionary.doc2bow(doc) for doc in docs] #create corpus

	model = LdaModel( #=> https://radimrehurek.com/gensim/models/ldamodel.html
		corpus=corpus,
		id2word=dictionary,
		# chunksize=2000,
		# alpha='auto',
		# eta='auto',
		iterations=200,
		# passes=20,
		# eval_every=None,
		num_topics=cnt_topics
	)
		
	top_topics = model.top_topics(corpus) #[([(a, x), ..., (a, x)], a), ...]

	return top_topics
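Assumed imports for this snippet (inferred from usage):

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer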
Example #32
def freq_for_all():
    conn = sqlite3.connect('stonks.db')
    c = conn.cursor()
    c.execute("select text from posts where  text <> '[removed]'")
    total = c.fetchall()
    for i in total:
        for x in i:
            filtered = remove_stopwords(x)
            split = filtered.split()
            for z in split:
                if z in my_stop_words:
                    pass
                else:
                    word_list.append(z)

    conn.commit()
    conn.close()

    for word in word_list:
        d[word] = d.get(word, 0) + 1

    word_freq = []
    for key, value in d.items():
        word_freq.append((value, key))

    word_freq.sort()
    print(word)
Example #33
def save_word_dict(text):
    proc_text = []

    sentences = text
    sentences = tokenize.sent_tokenize(sentences)

    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)

        proc_sentence = word_tokenize(sentence_without_stops.lower())

        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)

    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]
Example #34
 def testStripStopwords(self):
     self.assertEqual(remove_stopwords("the world is square"), "world square")