Example #1
def remove_stopwords(words):
    stopwords = get_stopwords()
    #     all_words = [x for x in words]
    all_words = [re.sub(r'[^\w\s]', '', x)
                 for x in words]  # remove punctuation
    all_words = [x for x in all_words if x not in stopwords]
    return all_words
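For orientation, a minimal driver for the snippet above; it assumes get_stopwords comes from the PyPI stopwords package (as in the other examples here) and that re is imported in the same module.

import re
from stopwords import get_stopwords

tokens = "the quick, brown fox!".split()
print(remove_stopwords(tokens))  # e.g. ['quick', 'brown', 'fox'], depending on the stopword list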
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dialogue_file',
        default='../data/subtitles/subtitlesInTSV/finding_nemo_clean.tsv')
    parser.add_argument('--data_dir',
                        default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    dialogue_file = args.dialogue_file
    data_dir = args.data_dir
    sub_name = os.path.basename(dialogue_file).replace('.tsv', '')
    print(sub_name)
    custom_words = [
        'will', 'don', 've', 're', 'oh', 'hey', 'ha', 'aah', 'll', 'can',
        'dont', 'just'
    ]
    stops = get_stopwords('en') + custom_words
    data = pd.read_csv(dialogue_file, sep='\t')
    all_docs = {}
    for slice_, data_group in data.groupby('slice'):
        clean_dialogue = []
        for d in data_group['dialogue']:
            # print('raw dialogue %s'%(d))
            # cleaned = clean_text(str(d))
            try:
                cleaned = d.decode('utf-8')
                clean_dialogue.append(cleaned)
            except Exception, e:
                print('could not clean text %s because error %s' % (d, e))
        all_dialogue = ' '.join(clean_dialogue)
        all_docs[slice_] = all_dialogue
Example #3
File: test_.py Project: gaaragots/e
def tokeniser(desc_text):
    return [
        PorterStemmer().stem(token) for token in wordpunct_tokenize(
            re.sub('[%s]|\w*\d\w*' %
                   re.escape(string.punctuation), '', desc_text.lower()))
        if token.lower() not in get_stopwords()
    ]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tf',
        # default='../../data/frequency/2015_2016_tf.tsv')
        default='../../data/frequency/2015_2016_tf_norm.tsv')
    parser.add_argument('--top_k', type=int, default=100000)
    args = parser.parse_args()
    tf_file = args.tf
    top_k = args.top_k
    print(tf_file)
    timeframe = re.findall('201[0-9]_201[0-9]', tf_file)[0]
    tf = pd.read_csv(tf_file, sep='\t', index_col=0)
    totals = tf.sum(axis=1)
    totals.sort_values(inplace=True, ascending=False)
    stops = set(get_stopwords('en'))
    # only want valid words!!
    valid_words = list(
        filter(lambda w: (type(w) is str and w.isalpha()) and w not in stops,
               totals.index))
    top_vocab = totals.loc[valid_words][:top_k]
    top_vocab = pd.DataFrame(top_vocab, columns=['count'])
    print('got %d vocab' % (len(top_vocab)))
    # renormalize
    top_vocab.loc[:, 'count'] = (
        top_vocab.loc[:, 'count'] / top_vocab.loc[:, 'count'].sum(axis=0))
    out_dir = os.path.dirname(tf_file)
    out_fname = os.path.join(out_dir,
                             '%s_top_%d_vocab.tsv' % (timeframe, top_k))
    top_vocab.to_csv(out_fname, sep='\t')
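A toy illustration of the renormalization step above (the count column is rescaled so it sums to 1); it assumes nothing beyond pandas.

import pandas as pd

vocab = pd.DataFrame({'count': [30.0, 20.0, 10.0]}, index=['a', 'b', 'c'])
vocab.loc[:, 'count'] = vocab.loc[:, 'count'] / vocab.loc[:, 'count'].sum(axis=0)
print(vocab['count'].tolist())  # [0.5, 0.333..., 0.166...]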
def remove_stopwords(words):
    translator = str.maketrans('', '', punctuation)
    stopwords = get_stopwords()
    all_words = [
        word.translate(translator) for word in words if word not in stopwords
    ]
    return all_words
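A hypothetical call to this variant; note that, unlike Example #1, it checks the stopword list before stripping punctuation, so a token such as 'the,' passes the check and only loses its comma.

from string import punctuation
from stopwords import get_stopwords

print(remove_stopwords(['the', 'the,', 'quick', 'fox!']))
# e.g. ['the', 'quick', 'fox'] -- 'the,' is not in the stopword list, so it survives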
def frequence_word(list):
    # Unit test done.
    # Takes as input the result list from the domain search in the analyse function;
    # here we compute the frequency of each word in the list.
    nb_frequence = {}
    mots_inutiles = stopwords.get_stopwords('fr') + stopwords.get_stopwords('en')
    for word in list:
        # keep only words longer than 3 characters, so they actually carry meaning,
        # and drop stopwords
        if word not in mots_inutiles and len(word) > 3:
            if word in nb_frequence:
                nb_frequence[word] += 1
            else:
                nb_frequence[word] = 1
    return nb_frequence
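The counting loop above is equivalent to collections.Counter; a sketch under the same assumptions (French plus English stopword lists from the stopwords package, minimum word length of 4).

from collections import Counter
from stopwords import get_stopwords

def frequence_word_counter(words):
    mots_inutiles = set(get_stopwords('fr') + get_stopwords('en'))
    return Counter(w for w in words if w not in mots_inutiles and len(w) > 3)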
def do_process(args):
    sc = SparkContext(appName="task")
    sc.addPyFile("generate_key.py")

    stop_words = get_stopwords(args.lang)

    rdd = sc.textFile(args.dump)

    rdd_processed = rdd.flatMap(
        lambda x: map_preprocess_wikidata(x, args.lang, stop_words))

    rdd_processed.saveAsSequenceFile(args.output)
def text_pre_processing(csvFile, columnNumberForText):
    # import data-set
    # colNum becomes an index, which should start at 0, and columns in spreadsheets start at 1, so subtract 1 from columnNumberForText
    documents = importColumnFromCSV(fileName=csvFile, colNum=int(columnNumberForText) - 1, header=True)
    print "imported documents..."

    # phrase detection model training
    abstracts = []  # list of abstracts containing a list of words
    for line in documents:
        # tokenize abstract
        tokens = nltk.word_tokenize(remove_non_ascii(line))
        abstracts.append(tokens)

    # create bigram and trigram phrase models
    bigram = models.Phrases(abstracts)
    trigram = models.Phrases(bigram[abstracts])
    print "built bigram and trigram phrase detection models..."

    # text pre-processing tools
    stops = get_stopwords('en')  # stronger stopwords
    STOPS = list(' '.join(str(e).title() for e in stops).split()) # title-cased stopwords
    noNum = re.compile(r'[^a-zA-Z ]')  # number and punctuation remover

    # function that cleans the text
    def clean(text):
        clean_text = noNum.sub(' ', text)               # remove numbers and punctuations
        tokens = nltk.word_tokenize(clean_text)         # tokenize text
        filtered_words = [w for w in tokens if not w in stops]      # filter out lowercase stopwords
        double_filtered_words = [w for w in filtered_words if not w in STOPS]    # filter out title-cased stopwords

        trigrams = trigram[bigram[double_filtered_words]]   # apply the bigram and trigram models to the filtered words
        trigrams_str = ' '.join(str(x) for x in trigrams)   # stringify clean and filtered tokens
        return trigrams_str

    results = []  # create list for storing clean abstracts

    # figure out path for the text corpus
    rawFilePathBase = os.path.basename(csvFile)
    rawFileName = os.path.splitext(rawFilePathBase)[0]
    corpusPath = "../../data/" + rawFileName + "_textCorpus.txt"

    # write list of clean text documents to text corpus file
    with open(corpusPath, 'w') as f:
        print 'Cleaned up text corpus file has been created at ', corpusPath, ' ...'
        f.truncate()        # if file is not empty, remove everything inside the file
        for abstract in documents:
            text = clean(abstract)      # clean each abstract, one at a time
            f.write(text + '\n')        # write clean abstract to desired text corpus file
            results.append(text)        # append clean abstracts to list
    return results, corpusPath          # return a list of clean abstracts
Example #9
    def preprocess(self, s):
        # extract all tokens build with specific words for tweets
        tokens = self.tokenize(s)

        # gather stop words
        punctuation = list(string.punctuation)
        stop = stopwords.get_stopwords() + punctuation

        # return tokens without stop words
        tokens = [
            token.lower() for token in tokens if token.lower() not in stop
        ]

        return tokens
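For context, a minimal sketch of the class this method appears to belong to; the tokenize helper below is a hypothetical stand-in for the project's tweet-aware tokenizer.

import string
import stopwords  # PyPI stopwords package (assumption)

class TweetPreprocessor:
    def tokenize(self, s):
        # hypothetical stand-in; the real project presumably uses a
        # tweet-aware regex tokenizer (hashtags, mentions, emoticons)
        return s.split()

    def preprocess(self, s):
        tokens = self.tokenize(s)
        stop = stopwords.get_stopwords() + list(string.punctuation)
        return [t.lower() for t in tokens if t.lower() not in stop]

print(TweetPreprocessor().preprocess("Just setting up my twttr !"))  # output depends on the stopword list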
Example #10
    def __init__(self, name=None, level=None, **kwargs):
        super(SimilarityCache, self).__init__(**kwargs)
        self.name = name
        self.confdLevel = level
        self.docuCommonWords = [
            'principal', 'discharge', 'diagnosis', 'responsible', 'after',
            'study', 'causing', 'admission', 'same', 'other', 'record',
            'orders', 'end', 'conditions', 'infections', 'complications',
            'diet', 'service', 'admission', 'date', 'limited', 'need', 'felt',
            'month', 'day', 'years', 'service', 'full', 'code', 'status',
            'medications', 'entered', 'order', 'summary', 'will', 'none',
            'summary:', 'home', 'year', '~', 'liter', 'status:', 'know', '?'
        ]

        self.stop_words = get_stopwords('en')
def __stem_doc(doc_details):
    # Import nltk tools
    from nltk.tokenize import wordpunct_tokenize as wordpunct_tokenize
    # from nltk.stem.snowball import EnglishStemmer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer
    idx, doc = doc_details
    if idx % 100 == 0:
        print "Processed doc " + str(idx)
    if doc.endswith('.txt'):
        d = open(doc).read()
        stemmer = EnglishStemmer()  # This method only works for english documents.
        # Stem, lowercase, substitute all punctuations, remove stopwords.
        attribute_names = [stemmer.stem(token.lower()) for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), '', d.decode(encoding='UTF-8', errors='ignore'))) if
                           token.lower() not in stopwords.get_stopwords()]
        s.dump(attribute_names, open(doc.replace(".txt", ".p"), "wb"))
Example #12
def do_process(args):
    sc = SparkContext(appName="task")
    stop_words = get_stopwords(args.lang)

    sections_by_title = extract_sections(args.dump)

    dataset = []
    for title, sections in sections_by_title.items():
        for section in sections:
            key = generate_key(stop_words, title, section)
            value = "{}#{}".format(title, section)
            if key != "":
                dataset.append((key, value))

    rdd = sc.parallelize(dataset)
    rdd.saveAsSequenceFile(args.output)
Example #14
def calculate_similarity(query, text, model_path):
    embeddingSize = 300
    query_embedding = np.zeros((1, embeddingSize))
    stop = stopwords.get_stopwords('english')
    model = word2vec.load(model_path)
    query_embedding = get_embedding(query, model, stop, query_embedding)

    nword = 0
    score = 0.0
    for word in nltk.tokenize.word_tokenize(text.decode('utf8')):
        if word in model and word not in stop:
            nword += 1
            wordNorm = linalg.norm(model[word])
            score += np.dot(query_embedding, model[word]) / wordNorm

    if nword != 0:
        score = score / nword
    print score[0]
    return score[0]
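For clarity on why the function returns score[0]: query_embedding has shape (1, embeddingSize), so each np.dot call yields a length-1 array; a toy check using only numpy.

import numpy as np

query_embedding = np.zeros((1, 300))
word_vector = np.ones(300)
print(np.dot(query_embedding, word_vector).shape)  # (1,) -- hence score[0] at the end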
Example #15
def pos_tag_tweets(name):
    # Remove stopwords
    stop_words = set(stopwords.get_stopwords())
    a = {
        ',', '!', '#', '%', ':', '+', '.', '@', '-', '&', '?', '\"', '\'', '(',
        ')', '\'', '`'
    }
    stop_words.update(a)

    #get all the tweets and tokenize them
    tokenized_tweets = []
    candidate_tweets = candidate.tweets(name)
    temp = word_tokenize(candidate_tweets)

    for t in temp:
        if t not in stop_words:
            tokenized_tweets.append(t)
    tagged_tweets = nltk.pos_tag(tokenized_tweets)

    return tagged_tweets
Example #16
def liste_mots(tweets):
    # takes as input a list of tweet texts, which we concatenate into a single string
    text = ''
    mots_inutiles = stopwords.get_stopwords('en')
    mystopwords = ['I', 'will', '\'', 'The', 'http']
    for tweet in tweets:
        text += str(tweet)
    liste = TextBlob(text)
    # wordlist is the list of words from all the tweets
    wordlist = liste.words
    unique = []
    for word in wordlist:
        w = Word(word)
        # keep words that are not stopwords, reduce them to their base form,
        # and keep only those at least 3 characters long
        if word not in mots_inutiles and word not in mystopwords and len(word) > 2:
            word_lemmatize = w.lemmatize()
            unique.append(word_lemmatize)
    # return the words as a LIST
    return unique
Example #18
def get_tweets(username):

	forbidden_words=["https", "RT", "en", "lo", "de", "the", "a", ""]
	# http://tweepy.readthedocs.org/en/v3.1.0/getting_started.html#api
	auth = tweepy.OAuthHandler(config.BaseConfig.CONSUMER_KEY, config.BaseConfig.CONSUMER_SECRET)
	auth.set_access_token(config.BaseConfig.ACCESS_KEY, config.BaseConfig.ACCESS_SECRET)
	api = tweepy.API(auth)

	# set count to however many tweets you want
	number_of_tweets = 100

	# get tweets
	tweets_for_csv = []
	words = ''
	for tweet in tweepy.Cursor(api.user_timeline, screen_name = username).items(number_of_tweets):
		words += str(tweet.text.encode("utf-8"), 'utf-8')

	stop_words = get_stopwords()
	word_tokens = words.split(" ")
	filtered_sentence = [ w for w in word_tokens if w not in stop_words and "@" not in w and "https" not in w ]
	words = ' '.join(filtered_sentence)

	return words
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    data_dir = args.data_dir
    dialogue_files = [
        f for f in os.listdir(data_dir) if re.findall('S[1-9]E[0-9]+.tsv', f)
    ]
    dialogue_files = [os.path.join(data_dir, f) for f in dialogue_files]
    stops = get_stopwords('en') + [
        'will',
        'don',
        've',
    ]
    all_docs = {}
    for f in dialogue_files:
        ep_name = re.findall('S[1-9]E[0-9]+', f)[0]
        data = pd.read_csv(f, sep='\t')
        docs = []
        for chunk, data_group in data.groupby('chunk'):
            clean_dialogue = []
            for d in data_group['dialogue']:
                # print('raw dialogue %s'%(d))
                cleaned = clean_text(str(d))
                try:
                    cleaned = cleaned.decode('utf-8')
                    clean_dialogue.append(cleaned)
                except Exception, e:
                    print('could not clean text %s because error %s' %
                          (cleaned, e))
            all_dialogue = ' '.join(clean_dialogue)
            docs.append(all_dialogue)
        episode_text = ' '.join(docs)
        # print('got full text %s'%
        #       (episode_text))
        all_docs[ep_name] = episode_text
Example #20
def normalisasi2(pos_texts, neg_texts, kamus_hasil):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopwords = get_stopwords()

    pos_texts_normalized = []

    for text in pos_texts:
        pos_text_normalized = []

        for word in text.split():
            # normalization via the kamus_hasil dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    pos_text_normalized.append(word)

        pos_texts_normalized.append(' '.join(pos_text_normalized))

    neg_texts_normalized = []

    for text in neg_texts:
        neg_text_normalized = []

        for word in text.split():
            # normalization via the kamus_hasil dictionary
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    neg_text_normalized.append(word)

        neg_texts_normalized.append(' '.join(neg_text_normalized))

    return pos_texts_normalized, neg_texts_normalized
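The two loops above differ only in the input list; a possible refactor, sketched under the same assumptions (a kamus_hasil normalization dictionary, a stemmer object with a stem() method, and a stopword list).

def _normalize_texts(texts, kamus_hasil, stemmer, stopwords):
    # same behaviour as each loop above: dictionary normalization, stopword
    # check, stemming, then a second stopword check
    normalized = []
    for text in texts:
        kept = []
        for word in text.split():
            word = kamus_hasil[word]
            if word not in stopwords:
                word = stemmer.stem(word)
                if word not in stopwords:
                    kept.append(word)
        normalized.append(' '.join(kept))
    return normalized

With this helper, normalisasi2 would reduce to two calls, one for pos_texts and one for neg_texts.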
Example #21
    def initFeatures(self):
        ### text vectorization--go from strings to lists of numbers
        # words to exclude
        exclusion = stopwords.get_stopwords('english')

        # vectorizer
        vectorizer = TfidfVectorizer(stop_words=exclusion)
        jokes = preprocessJoke(self.DATA_LIMIT, jokeData)
        jokeLabels = labelData(jokes, 1)  # 1 is joke
        print ("Joke Labels length:", len(jokeLabels))

        tweets = preprocessNormalTweets(self.DATA_LIMIT, normalTweetData)
        tweetLabels = labelData(tweets, 0)  # 0 is not joke
        print ("Tweet labels length:", len(tweetLabels))

        # concat joke_labels and tweet_labels
        training_labels = jokeLabels + tweetLabels
        print ("training label length:", len(training_labels))

        # concat features
        training_features = jokes + tweets
        print ("training feature length:", len(training_features))
        transformedFeatures = vectorizer.fit_transform(training_features).toarray()
        return transformedFeatures, training_labels, vectorizer
Example #22
    return tokens_re.findall(s)


def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [
            token if emoticon_re.search(token) else token.lower()
            for token in tokens
        ]
    return tokens


print(preprocess(sentence))

stop = set(stopwords.get_stopwords('english'))
#stop = set(stopwords.words('english'))

tweets_data_path = '/Users/priyamurthy/Documents/PycharmProjects/program1/twitter_data.txt'

tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet['text'])
    except:
        continue
print "I'm here"
#i=0
#while i<len(tweets_data):
Example #23
from text_to_data import Doc2Data
from text_to_data import CalculatePair
from model import train_model, get_data
from progress.bar import Bar
import os
from corpy.udpipe import Model
import stopwords
from random import randint
import numpy as np

m = Model("russian-syntagrus-ud-2.5-191206.udpipe")
stop = stopwords.get_stopwords('ru')
# postfix can be _sm for demo corpus and _med for second part, _all for all corpus
postfix = '_all'

text_folder_name = 'texts' + postfix + '/'
data_folder_name = 'data' + postfix + '/'


def make_data_from_texts():
    # go through the folder and process all texts into JSON
    all_texts = os.listdir(text_folder_name)
    for text in Bar(' text parsing...').iter(all_texts):
        Doc2Data(text_folder_name + text, m, stop, data_folder_name)


def make_pairs(authors):
    all_texts = os.listdir(text_folder_name)
    texts = open('db' + postfix + '.csv', 'r').read().split('\n')[:authors]
    text = []
    for i in texts:
from orderedset._orderedset import OrderedSet

import json
import string
import re
from stopwords import get_stopwords
from tika import language
stopwords = get_stopwords("en")
stopwords = [x.upper() for x in stopwords]

#freqListFile = open("/Users/charanshampur/solr/lucene_solr_4_10/solr/example/solr-webapp/webapp/MyHtml/freqList.json","w")
freqListFile = open("freqList.json", "w")
sweetJsonFile = open(
    "/Users/charanshampur/PycharmProjects/CSCI599/MetaScoreNew.json", "r")
jsonLoad = json.load(sweetJsonFile)
langFile = open("Language.json", "r")
langDictionary = json.load(langFile)
removeWords = [
    "FOR", "LOGIN", "SALE", "NEW", "FREE", "``", "BUY", "SYSTEM", "WANT",
    "REPORT", "WITHIN", "S", "...", "TO", "SAN", "P", "W/", "ALL", "'S", "W",
    "M", "PAGE", "ITEMS"
]
#print "NLTK succesfully loaded<br>"
#print "Json succesfully loaded"
wordCloud = {}
skipList = [
    "NER_DATE", "id", "Geographic_LATITUDE", "content", "title",
    "Measurements", "Meta_Score", "NER_PERCENT", "NER_MONEY", ""
]

Example #25
from nltk.tokenize import RegexpTokenizer
from stopwords import get_stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import sys

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stopwords('en')
a = [
    '/', 's', 'n', ' ', 'may', '9', '11', '2016', 'brooklyn', 'cruise',
    'terminal', 'brooklyn', 'york', 'thus', 'far', 'disrupt', 'ny', 've',
    'seen', 'handshake', 'stalemate', 'iab', 'randall', 'rothenberger',
    'adblocker', 'till', 'faida', 've', 'seen', 'submit', 'startup', 'careers',
    'contact', 'us', 'privacy', 'policy', 'disclaimer', 'activate', 'facebook',
    'messenger', 'news', 'bot', 'subscribed', 'bot', 'will', 'send', 'digest',
    'trending', 'stories', 'day', 'can', 'also', 'customize', 'types',
    'stories', 'sends', 'click', 'button', 'subscribe', 'wait', 'new',
    'facebook', 'message', 'tc', 'messenger', 'news', 'bot', 'thanks', 'tc',
    'team', 'cost', 'costs', 'text', 'com', 'we', 'users', 'user', 'people',
    'global', 'you', 'city', 'state', 'country'
]

do_not_include = [
Example #26
def get_default_stopwords():
    return get_stopwords('en')
from stopwords import get_stopwords
from bs4 import BeautifulSoup

from emoji_processing import replace_hidden_emoji
from message_reactions import delete_reaction_end

stopwords = get_stopwords()


def read_file(filepath=None, text=None):
    if filepath:
        with open(filepath, "r") as f:
            lines = f.readlines()
    elif text:
        lines = [text]
    else:
        raise Exception("Neither text or filepath was entered")

    soup = BeautifulSoup(lines[0], "html.parser")
    names = [
        n.text
        for n in soup.findAll("div", {"class": "_3-96 _2pio _2lek _2lel"})
    ]
    messages = [m.text for m in soup.findAll("div", {"class": "_3-96 _2let"})]
    times = [t.text for t in soup.findAll("div", {"class": "_3-94 _2lem"})]

    names.reverse()
    times.reverse()
    messages.reverse()

    return list(zip(names, times, messages)), names, times, messages
Example #28
                    #Appending the articles
                    url.append(urls)
                    title.append(titles)
                    dop.append(dops)
                    content.append(texts)

                    #Condition for reading in the 25000 articles
                    ten_thousand+=1

            except:
                count+=1


#Tokenising
tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
english_stopwords = get_stopwords('en')
english_stopwords.append('reuters')
english_stopwords.append('said')
token_content = []
processed_content = []
for article in content:
    tokens = tokenizer.tokenize(article.lower())
    token_content.append(tokens)
    stopped_tokens = [i for i in tokens if i not in english_stopwords]
    processed_content.append(stopped_tokens)


# Creating a bigram model
bigram = models.Phrases(token_content, min_count=5, threshold = 100)
bigram_mod = models.phrases.Phraser(bigram)
bigram_content = [bigram_mod[i] for i in processed_content]
    HAS_NLTK = True
except Exception, exception:
    HAS_NLTK = False
    print "word stemmer is turned off"\
        "download nltk at http://www.nltk.org/ for feature"
from stopwords import get_stopwords

HAS_NLTK = False
wn_lemmatizer = None

if HAS_NLTK:
    # create stemmer
    wn_lemmatizer = WordNetLemmatizer()

# get list of stopwords
stop_words = get_stopwords()
# punctuation
EndPunctuationSet = set(".,?!()[]`%'\"")
StartPunctuationSet = set("\"([&^*<@`")


def reversed_dict(index):
    """ reverse keys and values """
    return dict((v, k) for k, v in index.iteritems())


def open_text(filename):
    """opens single file with multiple documents, separated by <TEXT> tag"""
    wlist = []
    groups = []
    with open(filename, 'r') as tfile:
N_SLICES = 60
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--sub_file',
        default='../data/subtitles/subtitlesInTSV/finding_nemo_clean.tsv')
    parser.add_argument('--LIWC_dir',
                        default='/hg191/corpora/LIWC/resources/liwc_lexicons/')
    args = parser.parse_args()
    sub_file = args.sub_file
    LIWC_dir = args.LIWC_dir
    LIWC_categories = [
        'positive_affect', 'negative_affect', 'anger', 'death', 'family',
        'home', 'humans', 'social', 'percept', 'insight'
    ]
    stopwords = get_stopwords('en')
    LIWC_category_wordlists = {
        c: [
            re.compile('^' + l.strip() + '$')
            for l in open(os.path.join(LIWC_dir, '%s' % (c)), 'r')
            if l.strip() not in stopwords
        ]
        for c in LIWC_categories
    }
    # replace positive/negative affect
    LIWC_categories += ['positive', 'negative']
    LIWC_categories.remove('positive_affect')
    LIWC_categories.remove('negative_affect')
    LIWC_category_wordlists['positive'] = LIWC_category_wordlists.pop(
        'positive_affect')
    LIWC_category_wordlists['negative'] = LIWC_category_wordlists.pop(
Example #31
def main():
    # if a GPU is available, use it
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Use " + str(device))

    # Load the training dataset and create a dataloader to generate batches.
    # The Field automatically lowercases the text and records sequence lengths.
    textField = data.Field(
        lower=True,
        include_lengths=True,
        batch_first=True,
        preprocessing=preprocessing,  # word-level preprocessing, e.g. removing past-tense and similar forms
        postprocessing=postprocessing,
        stop_words=get_stopwords())  # drop every word in the stopword list
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.csv', 'csv', {
        'text': ('text', textField),
        'target': ('target', labelField)
    })

    textField.build_vocab(
        dataset, vectors=config.wordVectors)  # convert the data to vectors using the textField defined above

    # split the dataset into training and validation sets
    train_dataset, validate_dataset = dataset.split(
        split_ratio=config.proportion_of_val_dataset,
        stratified=True,
        strata_field='target')

    train_loader, val_loader = data.BucketIterator.splits(
        (train_dataset, validate_dataset),
        shuffle=True,
        batch_size=config.batchSize,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    net = get_model(config.dim, config.from_old_model,
                    config.model_path).to(device)

    criterion = config.criterion

    params = net.parameters()
    # create optimizer
    if config.optimizer_name == "SGD":
        optimizer = toptim.SGD(params, lr=config.learning_rate)
    elif config.optimizer_name == "Adam":
        optimizer = toptim.Adam(params, lr=config.learning_rate)
    elif config.optimizer_name == "AdamW":
        optimizer = AdamW(params, lr=config.learning_rate, weight_decay=1e-6)

    # mixed-precision acceleration
    if config.use_apex:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O1")

    train_start = time.time()

    for epoch in range(config.epochs):
        '''
        # change lr by epoch
        adjust_learning_rate(optimizer, epoch)
        '''

        # start train
        train(net, train_loader, config.criterion, optimizer, epoch, device,
              log, textField)

        # start val
        val(net, val_loader, config.criterion, optimizer, epoch, device, log,
            train_start, textField)

    print("Final saved model is epoch " + str(best_val_acc[0]) + ", acc: " +
          str(best_val_acc[1]) + ".")
    log.write("Final saved model is epoch " + str(best_val_acc[0]) +
              ", acc: " + str(best_val_acc[1]) + "\n")

    print("Done.")
    log.write("Done.\n")
Example #32
import pickle

import warnings

warnings.filterwarnings('ignore')
import os
import sys

import numpy as np  # numpy and pandas are used below but were missing from the excerpt
import pandas as pd

np.set_printoptions(threshold=sys.maxsize)

from time import time

from utils import preprocess
from stopwords import get_stopwords

STOPWORDS = get_stopwords()

t = time()

# READ IN REVIEWS
print('Loading Dataset...')
reviews = pd.read_csv('data/amazon_reviews_us_Electronics_v1_00.tsv',
                      sep='\t',
                      error_bad_lines=False)
reviews = reviews.iloc[:1000]
print('Dataset Loaded: ', round(time() - t, 2), 's')
print("Full Size:", reviews.shape[0], ' reviews')

# DROP USELESS ROWS
print('Cleaning dataframe...')
E_simple = reviews[[
Example #33
def stopper(testo):
    stop_words = set(stopwords.get_stopwords('english'))
    result = [i.lower() for i in testo if i.lower() not in stop_words]
    return result
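A hypothetical call to stopper, assuming stopwords.get_stopwords('english') returns an English stopword list as in the other examples.

print(stopper("The Quick Brown Fox".split()))  # e.g. ['quick', 'brown', 'fox'] if 'the' is a stopword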
from orderedset._orderedset import OrderedSet

import json
import string
import re
from stopwords import get_stopwords
from tika import language
stopwords=get_stopwords("en")
stopwords=[x.upper() for x in stopwords]


#freqListFile = open("/Users/charanshampur/solr/lucene_solr_4_10/solr/example/solr-webapp/webapp/MyHtml/freqList.json","w")
freqListFile = open("freqList.json","w")
sweetJsonFile=open("/Users/charanshampur/PycharmProjects/CSCI599/MetaScoreNew.json","r")
jsonLoad=json.load(sweetJsonFile)
langFile = open("Language.json","r")
langDictionary=json.load(langFile)
removeWords=["FOR","LOGIN","SALE","NEW","FREE","``","BUY","SYSTEM","WANT","REPORT","WITHIN","S","...","TO","SAN","P","W/","ALL","'S","W","M","PAGE","ITEMS"]
#print "NLTK succesfully loaded<br>"
#print "Json succesfully loaded"
wordCloud={}
skipList=["NER_DATE","id","Geographic_LATITUDE","content","title","Measurements","Meta_Score","NER_PERCENT","NER_MONEY",""]

def reduceList(nestedList):
    MainList=[]
    def subList(x):
        if type(x) is list:
            for item in x:
                subList(item)
        else:
            MainList.append(x)