def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.lancaster_stemmer = LancasterStemmer()
        self.porter_stemmer = PorterStemmer()
        self.snowball_stemmer = SnowballStemmer('english')

        wn.ensure_loaded()
Example #2
def main():
    global bow_corpus
    global word_to_idx
    global users
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()

    results = []
    N = 0
    # for chik, svdk in exp:
    #    r= []
    #for N in range(15):

    results.append(truth_prediction_for_users(users, idx_to_word, 10000, 20,
                                              N))
    print(np.average(np.asarray(results), axis=1))
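
The corpus filtering in main() keeps only tokens seen more than twice and builds the two index maps used throughout these examples. A toy illustration with made-up counts (not project data):

# Toy illustration of the vocabulary filter and index maps (made-up counts).
bow_corpus = {'vaccine': 7, 'hoax': 3, 'the': 120, 'xyzzy': 1}
vocab = [w for w, count in bow_corpus.items() if count > 2]
word_to_idx = {k: idx for idx, k in enumerate(vocab)}
idx_to_word = {idx: k for k, idx in word_to_idx.items()}
# 'xyzzy' (count 1) is dropped; the remaining tokens get stable integer ids.
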
Example #3
 def __init__(self):
     self.wnl = WordNetLemmatizer()
     wn.ensure_loaded()
     self.contractions = {
         'isn\'t': ['is', 'not'],
         'aren\'t': ['are', 'not'],
         'wasn\'t': ['was', 'not'],
         'weren\'t': ['were', 'not'],
         'don\'t': ['do', 'not'],
         'doesn\'t': ['does', 'not'],
         'didn\'t': ['did', 'not'],
         'can\'t': ['cannot'],
         'we\'re': ['we', 'are'],
         'i\'m': ['I', 'am'],
         'it\'s': ['it', 'is'],
         'haven\'t': ['have', 'not'],
         'hasn\'t': ['has', 'not'],
         'hadn\'t': ['had', 'not'],
         'couldn\'t': ['could', 'not'],
         'mightn\'t': ['might', 'not'],
         'mustn\'t': ['must', 'not'],
         'shan\'t': ['shall', 'not'],
         'mayn\'t': ['may', 'not'],
         'shouldn\'t': ['should', 'not'],
         'won\'t': ['will', 'not'],
         'wouldn\'t': ['would', 'not'],
         'daren\'t': ['dare', 'not'],
         'needn\'t': ['need', 'not'],
         'usedn\'t': ['use', 'not'],
         'let\'s': ['let', 'us'],
         'you\'ve': ['you', 'have'],
         'i\'ve': ['I', 'have'],
     }
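
A hypothetical helper (not part of the original class) showing how such a mapping might be applied to a token stream; expand_contractions and the example tokens below are illustrative only:

def expand_contractions(tokens, contractions):
    # Replace each token that matches a contraction key with its expansion.
    expanded = []
    for tok in tokens:
        expanded.extend(contractions.get(tok.lower(), [tok]))
    return expanded

# expand_contractions(["I'm", "sure", "it's", "fine"], contractions)
# -> ['I', 'am', 'sure', 'it', 'is', 'fine']
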
Example #4
    def __init__(self):
        #Ensuring that the wordnet corpus is loaded, so we can support multithreading
        wn.ensure_loaded()

        self.lemmatizer = wn_stem.WordNetLemmatizer()
        self.lemmas_dict = {}
        self.synsets_dict = {}
        self.similarity_dict = {}
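
A minimal sketch of why the comment above matters, assuming only NLTK and the standard library: the corpus object stays a LazyCorpusLoader until first use, so materializing it in the main thread avoids a race when several threads touch it at once.

from threading import Thread
from nltk.corpus import wordnet as wn

def lemma_names(word):
    # Safe to call concurrently once the corpus has been materialized.
    return [l.name() for s in wn.synsets(word) for l in s.lemmas()]

wn.ensure_loaded()  # force LazyCorpusLoader -> WordNetCorpusReader in the main thread

threads = [Thread(target=lemma_names, args=(w,)) for w in ("run", "cat", "bank")]
for t in threads:
    t.start()
for t in threads:
    t.join()
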
 def init():
     nonlocal cached_stopwords
     try:
         wordnet.ensure_loaded()
         cached_stopwords = set(stopwords.words("english"))
     except LookupError:
         nltk.download("punkt")
         nltk.download("stopwords")
         nltk.download("wordnet")
         cached_stopwords = set(stopwords.words("english"))
	def build_set(self):
		wn.ensure_loaded()  # `LazyCorpusLoader` conversion into `WordNetCorpusReader` starts
		print ("WordNet loaded")
		swn.ensure_loaded()  # `LazyCorpusLoader` conversion into `SentiWordNetCorpusReader` starts
		print ("SentiWordNet loaded")
		self.tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
		print ("Tweet tokenizer loaded")
		self.it_tokenizer = MosesTokenizer(lang='it')
		print ("Moses tokenizer loaded")
		self.it_tagger = treetaggerwrapper.TreeTagger(TAGLANG="it", TAGDIR=flags.tagger_path)
		# self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en", TAGDIR=flags.tagger_path)
		print ("Tagger loaded")
		self.stop_words = set(stopwords.words('italian'))
		print ("Stopwords loaded")
		self.lexicon = lm.LexiconSent('it')
		print ("OpeNER lexicon loaded")
		self.emoji = self.get_emoji_sentiment_lexicon(flags.emoji_sentiment_lexicon)
		print ("Emoji sentiment lexicon loaded")
		self.translator = Translator()
		print ("Setting up support dictionaries")
		self.translated_lemma_tokens = self.load_obj(flags.translated_lemma_tokens)
		self.lexeme_sentiment_dict = self.load_obj(flags.lexeme_sentiment_dict)
		print ("Translator loaded")
		# Build test annotations
		print ("Building test annotations..")
		test_set = self.load_obj(flags.test_annotations)
		if not test_set:
			test_set = self.get_annotations(flags.test_set_path)
			self.save_obj(test_set, flags.test_annotations)
		print ("Test annotations built")
		# Build training annotations
		print ("Building training annotations..")
		training_set = self.load_obj(flags.training_annotations)
		if not training_set:
			training_set = self.get_annotations(flags.training_set_path)
			self.save_obj(training_set, flags.training_annotations)
		print ("Training annotations built")
		print ("Saving support dictionaries")
		self.save_obj(self.translated_lemma_tokens, flags.translated_lemma_tokens)
		self.save_obj(self.lexeme_sentiment_dict, flags.lexeme_sentiment_dict)
		# Build distributional docvec from training and test sets
		self.doc2vec = self.build_distributional_docvec([test_set, training_set])
		print ("Doc2Vec built")
		self.add_context_to_annotations(test_set)
		print ("Distr. docvec added to test annotations")
		self.add_context_to_annotations(training_set)
		print ("Distr. docvec added to training annotations")
		self.free_ram()
		print ("Loading pre-trained model..")
		self.model = ft.load_model(flags.word2vec_path)
		print ("Pre-trained model loaded")
		self.add_wordvecs_to_annotations(test_set)
		print ("Wordvecs added to test annotations")
		self.add_wordvecs_to_annotations(training_set)
		print ("Wordvecs added to training annotations")
		# Save to npy
		self.free_ram()
		self.save_obj({"test_set":test_set, "training_set":training_set}, flags.preprocessed_dict)
def feature_pred(features, chik, ldak):
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)

    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        tr_hsh = transactions['fact'].values
        # if castillo: comment cond2 out
        cond = facts['hash'].isin(tr_hsh)
        cond2 = (facts['true'] == 1) | (facts['true'] == 0)
        facts = facts[cond & cond2]
        facts = Parallel(n_jobs=num_jobs)(delayed(get_features)(
            fact, transactions[transactions['fact'] == fact['hash']], [
                u for u in users if int(u.user_id) in list(transactions[
                    transactions['fact'] == fact['hash']]['user_id'].values)
            ]) for idx, fact in facts.iterrows())
        facts = pd.DataFrame(facts)
        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    print(facts[list(features)].describe())
    X = facts[list(features)].values
    y = facts['y'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak),
                            SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    return acc_scores.mean()
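
As an aside, the four separate cross_val_score calls above can be collapsed into a single pass with cross_validate; a sketch assuming the same std_clf, X and y (binary labels assumed for the precision/recall/f1 scorers):

from sklearn.model_selection import cross_validate

# One cross-validation pass computing all four metrics at once.
scores = cross_validate(std_clf, X, y, cv=3,
                        scoring=('accuracy', 'precision', 'recall', 'f1'))
print(scores['test_accuracy'].mean(), scores['test_f1'].mean())
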
Example #8
def elaborateText(tweet_text):
    #removing urls
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                      '(?:%[0-9a-fA-F][0-9a-fA-F]))+','',  tweet_text)
    
    
    #removing emoticon
    text = remove_emoji(text)
    
    text = re.sub("(@[A-Za-z0-9_]+)","", text)
    
    text = text.lower()
    
    for punct_sign in string.punctuation:
        text = text.replace(punct_sign," ")
        
    #other punctuation not within the set of string.punctuation->{!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}
    text = text.replace("’"," ")
    text = text.replace("”"," ")
    text = text.replace("“"," ")
    text = text.replace("\n"," ")
    
    # removing numbers  
    text = re.sub(r'\d+', '', text)
    
    #tokenization
    tokens = nltk.word_tokenize(text)
    
    #removing stopwords
    stopWords = stopwords.words('english')
    filteredTokens = [word for word in tokens if word not in stopWords]
    
    #lemmatization
    wn.ensure_loaded()

    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    tokensTagging = pos_tag(filteredTokens)
    
    # pos tagging & lemmatization
    for word, tag in tokensTagging:

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
        
    lemmatized_sentence_set = ' '.join(lemmatized_sentence)
  
    return lemmatized_sentence_set
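
The tag-to-POS mapping above folds everything that is not a noun or verb into adjectives. A hypothetical fuller mapping (penn_to_wordnet is not part of the original snippet) would also cover adverbs:

from nltk.corpus import wordnet as wn

def penn_to_wordnet(tag):
    # Map Penn Treebank tags to the WordNet POS constants the lemmatizer expects.
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('R'):
        return wn.ADV
    return wn.NOUN  # default, matching WordNetLemmatizer's own default
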
def get_clean_content(file: str):
    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
    # not the best fix but it works
    nltk_load_lock.acquire()
    wordnet.ensure_loaded()
    nltk_load_lock.release()

    meta, content = extract(file)

    if content is not None:
        lang = meta["language"]
        content = clean(content, lang)

    return meta, content
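
The same guard can be written with the lock as a context manager, which also releases it if ensure_loaded() raises; a small sketch assuming nltk_load_lock is a module-level threading.Lock as in the snippet above:

import threading
from nltk.corpus import wordnet

nltk_load_lock = threading.Lock()  # assumed module-level lock, mirroring the snippet above

def ensure_wordnet_loaded():
    # The with-block releases the lock even if ensure_loaded() raises.
    with nltk_load_lock:
        wordnet.ensure_loaded()
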
    def get(self, request, format=None):
        wordnet.ensure_loaded()

        seminars = self.request.query_params.get('seminars')
        guid = self.request.query_params.get('guid')
        user = get_user(guid)

        similarities = {}

        for seminar_id in seminars.split(','):
            try:
                # Load seminar and its keywords
                seminar = Seminar.objects.get(id=seminar_id)
                keywords = json.loads(seminar.keywords)

                # User has not set any interests or seminar has no keywords so set similarity to 0
                if not user.interests or not keywords:
                    similarities[seminar.id] = 0
                    continue

                # Get synsets of words for user interests and keywords
                interest_syns = set(
                    synset for interest in user.interests
                    for synset in wordnet.synsets(interest)
                )

                keyword_syns = set(
                    synset for keyword in keywords[0:3]
                    for synset in wordnet.synsets(keyword['text'])
                )

                # If no synsets of words, set similarity to 0
                if not interest_syns or not keyword_syns:
                    similarities[seminar.id] = 0
                    continue

                # Calculate best similarity between the sets of words
                best = max(
                    wordnet.wup_similarity(i, j) or 0
                    for i, j in product(interest_syns, keyword_syns)
                )

                # Convert to percentage and round to 1 decimal place
                similarities[seminar.id] = round(best * 100, 1)
            except Seminar.DoesNotExist:
                similarities[seminar.id] = 0
                continue

        return Response(similarities)
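
For reference, wup_similarity scores a pair of synsets by the depth of their least common subsumer relative to their own depths; a quick illustration (exact values depend on the installed WordNet version):

from nltk.corpus import wordnet

dog = wordnet.synsets('dog')[0]   # Synset('dog.n.01')
cat = wordnet.synsets('cat')[0]   # Synset('cat.n.01')
print(wordnet.wup_similarity(dog, cat))  # roughly 0.86 with WordNet 3.0
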
Example #11
def preload(fill_cache: bool):
    """
    Pre-loads any data so the user experience is better, i.e. there is less delay during use.
    :param fill_cache: if true, will run all parsing tests to fill the cache for the semantic distance function.
    """

    # Preload the WordNet dictionary.
    print('Loading WordNet...')
    wn.ensure_loaded()

    if fill_cache:
        print('Filling Cache (Running Tests)...')
        loader = TestLoader()
        suite = loader.discover(start_dir='tests/parsing')
        TextTestRunner(verbosity=1).run(suite)
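
A minimal usage sketch (assumed context, not from the original project): warm the corpus once in the main thread before handing work to parsers or request handlers.

if __name__ == '__main__':
    preload(fill_cache=False)  # skip the slower cache-filling test run
    # ... start worker threads or begin serving requests here ...
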
Example #12
    def __init__(self):
        #news = pd.read_csv("classifier/training_dataset.csv", names=['id', 'text', 'category'])

        # load the label encoder to decode category numbers
        #self.encoder = LabelEncoder()
        #self.encoder.fit_transform(news['category'])

        # load the text classifer
        #self.text_clf = open("classifier/nb_classifier.pkl", "rb")
        #self.text_clf = pickle.load(self.text_clf)

        self.porter = PorterStemmer()

        # prevents odd nltk error
        # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
        wn.ensure_loaded()
Example #13
def main():
    wn.ensure_loaded()
    users = get_users()

    #users = [was_user_correct(user) for user in users]
    #print("Linguistic features..")
    #users = Parallel(n_jobs=num_jobs)(delayed(linguistic_f)(user) for user in users)
    #print("Calculating tweet sentiment for each user")
    users = Parallel(n_jobs=num_jobs)(
        delayed(feature_user_tweet_sentiment)(user) for user in users)
    print("Avg time to retweet")
    users = Parallel(n_jobs=num_jobs)(delayed(time_til_retweet)(user)
                                      for user in users)
    print([u.sent_tweets_avg for u in users[:10]])
    print([u.avg_time_to_retweet for u in users[:10]])
    [store_result(user) for user in users]
def main():
    global bow_corpus
    global word_to_idx, idx_to_word
    global bow_corpus_top_n
    global users
    wn.ensure_loaded()
    bow_corpus = get_corpus()
    users = get_users()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    #print("Corpus size: {}".format(len(bow_corpus_tmp)))

    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    #for n in range(0,10,1):
    lstm_pred(-1)
def preprocess_text(docs):
    num_task = os.cpu_count()
    len_slices = len(docs) // num_task
    remainder_slices = len(docs) % num_task

    texts = []
    stoplist = set(stopwords.words('english'))
    
    wn.ensure_loaded()
    t_start = time.perf_counter()
    with ProcessPoolExecutor(max_workers=num_task) as executor:

        futures_tokenize = []
        for n in range(0, num_task):

            upper_bound = (n+1) * len_slices
            if n == num_task - 1:
                upper_bound = (n+1) * len_slices + remainder_slices

            print(n, upper_bound)
            futures_tokenize.append(executor.submit(preprocess_tokenize, docs[n * len_slices:upper_bound],
                            stoplist))

        for future in concurrent.futures.as_completed(futures_tokenize):
            texts += future.result()

    t_stop = time.perf_counter()
    print("removed stopwords and lemmatized in {} s".format(t_stop - t_start))
    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phraser(Phrases(texts, min_count=20))
    for idx in range(len(texts)):
        for token in bigram[texts[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                texts[idx].append(token)

    print("Done bigrams")
    dictionary = Dictionary(texts)
    dictionary.filter_extremes(no_below=30, no_above=0.5)
    dictionary.filter_tokens(bad_ids=[dictionary.token2id["like"]])
    special_tokens = {'_pad_': 0}
    dictionary.patch_with_special_tokens(special_tokens)

    return texts, dictionary
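
For illustration, the chunking above gives every worker len(docs) // num_task documents and folds the remainder into the last slice; a worked example with assumed sizes:

# Worked example of the slicing scheme (illustrative sizes only).
docs = list(range(10)); num_task = 4
len_slices, remainder = len(docs) // num_task, len(docs) % num_task   # 2 and 2
bounds = [(n * len_slices,
           (n + 1) * len_slices + (remainder if n == num_task - 1 else 0))
          for n in range(num_task)]
# bounds == [(0, 2), (2, 4), (4, 6), (6, 10)] -- the last worker absorbs the remainder
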
Example #16
    def IndexWebSite(self, url, urlSet, depth, urlAmount, flag=False):
        print("indexleme basladi")
        keywords = self.FindKeywords(url, 5)
        print(keywords)
        if (len(keywords) == 0): return []
        threads = []
        self.resultArr = []
        wn.ensure_loaded()
        for i in range(len(urlSet)):
            t = Thread(target=self.IndexSiteWithThread,
                       args=(urlSet[i], depth, urlAmount, keywords))
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()

        self.bubbleSort(self.resultArr)
        return self.resultArr
Example #17
def main():
    wn.ensure_loaded()
    # batch
    i = int(sys.argv[1])
    user_files = sorted(glob.glob(DIR + 'user_tweets/' + 'user_*.json'))
    r = 1960
    user_files = user_files[(i-1)*r:min(r*i,len(user_files))]
    users = get_users(user_files)
    facts, transactions = get_data()
    users = Parallel(n_jobs=num_jobs)(delayed(was_user_correct)(user, facts, transactions) for user in users)
    # print("Linguistic features..")
    # users = Parallel(n_jobs=num_jobs)(delayed(linguistic_f)(user) for user in users)
    # print("Calculating tweet sentiment for each user")
    # users = Parallel(n_jobs=num_jobs)(delayed(feature_user_tweet_sentiment)(user) for user in users)
    # print("Avg time to retweet")
    # users = Parallel(n_jobs=num_jobs)(delayed(time_til_retweet)(user) for user in users)
    # print([u.sent_tweets_avg for u in users[:10]])
    # print([u.avg_time_to_retweet for u in users[:10]])
    [store_result(user) for user in users]
Example #18
def main():
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()
    facts = gt.get_fact_topics()
    transactions = gt.get_transactions()
    users_df = pd.DataFrame([vars(u) for u in users])
    print(users_df.describe())
    print(users_df[users_df['stance'] == 0].describe())
    print(users_df[users_df['stance'] == 1].describe())
    print(users_df[users_df['stance'] == 2].describe())
    print(users_df[users_df['stance'] == 3].describe())
    users_df['f_t'] = users_df['fact'].map(
        lambda x: facts[facts['hash'] == x]['true'].values[0])
    c_true = users_df['f_t'] == '1'
    c_fal = users_df['f_t'] == '0'
    c_fal1 = users_df['f_t'] == 0
    c_den = users_df['stance'] == 0
    c_sup = users_df['stance'] == 1
    print(users_df[c_true & c_sup].describe())
    print(users_df[c_fal | c_fal1][c_den].describe())
    print(users_df[c_fal | c_fal1][c_sup].describe())
    print(users_df[c_true & c_den].describe())
    print(users_df[users_df['was_correct'] == 1].describe())
    print(users_df[users_df['was_correct'] == 0].describe())
    print(len([t for u in users if u.tweets is not None for t in u.tweets]))

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())

    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
Example #19
def main():
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())

    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
Example #20
 def __init__(self):
     print('loading the wordnet corpus...')
     wordnet.ensure_loaded()
     print('loading done')
     self.nlp = spacy.load('en')
     self.nlp.add_pipe(WordnetAnnotator(self.nlp.lang), after='tagger')
     with open('sorted_first_names.txt', 'r') as f:
         self.first_name_array = [line.rstrip() for line in f]
     with open('sorted_last_names.txt', 'r') as f:
         self.last_name_array = [line.rstrip() for line in f]
     with open('bad_words.txt', 'r') as f:
         self.profane_words_array = [line.rstrip() for line in f]
    def search(self, queryString,search_length=10,return_rank_list=False):
        wn.ensure_loaded()
        stop_words = set(stopwords.words('english'))
        porter_stemmer = PorterStemmer()
        wordnet_lemmatizer = WordNetLemmatizer()
        query = word_tokenize(queryString)
        query=[w.lower() for w in query if (w.isalpha() and w not in stop_words)]
        query=[(wordnet_lemmatizer.lemmatize(w)) for w in query]
        query=[porter_stemmer.stem(w) for w in query]

        self.processQuery( self.vocab, query )

        #Getting the page ranking for the above query
        obj = CosineScore(self.queryVector, self.tfidfMatrix)

        rankList = obj.getPages(search_length)
        if return_rank_list:
            return rankList

        #Getting the id and url name
        finalList = []
        for docIndex in rankList:
            finalList.append((self.titleList[docIndex], self.urlList[docIndex]))
        return (finalList)
def process(file_input_name, file_output_name):
    try:
        wordnet.ensure_loaded()
        data = pd.read_csv(file_input_name, encoding='ISO-8859-1')
        input_text = data['text']
        threads = []
        result = []

        for i in range(10):
            t = lemmatizeThread(thread_name='thread' + str(i),
                                the_queue=workQueue)
            t.start()
            threads.append(t)

        threadLock.acquire()
        for i in range(len(input_text) - 1):
            workQueue.put(str(i) + '-' + input_text[i])
        threadLock.release()

        while not workQueue.empty():
            pass

        global exitFlag
        exitFlag = 1

        for t in threads:
            t.join()
            result.append(t.data)

        with open(file_output_name, 'w') as f:
            while (not resultQueue.empty()):
                result = resultQueue.get()
                f.write(result + '\n')

    except Exception as e:
        print(e)
Example #23
from nltk.corpus import wordnet as wn
#right = wn.synset('right_whale.n.01')
wn.ensure_loaded()
help(wn)

Example #24
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    if NEW_MODEL:
        users = gt.get_users()
        # Prepping lstm model
        top_words = 50000
        X, y, user_order = lstm_cred.get_prebuilt_data()
        X, y, user_order = lstm_cred.balance_classes(X, y, user_order)

        #X_train, X_test, y_train, y_test = train_test_split_every_user(X, y, user_order)
        #X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users)
        #X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.5)

        X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words(
            X_train, y_train, X_test, y_test, idx_to_word, top_words)
        max_tweet_length = 12
        X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length)
        X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length)

        # Training lstm model
        embedding_vector_length = 32
        model = Sequential()
        model.add(
            Embedding(top_words,
                      embedding_vector_length,
                      input_length=max_tweet_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        print(model.summary())
        model.fit(X_train,
                  y_train,
                  validation_data=(X_test, y_test),
                  epochs=5,
                  batch_size=64)
        model.save('model_data/cred_model.h5')
        scores = model.evaluate(X_test, y_test, verbose=0)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        if NEW_REL_TWEETS:
            print('Building new relevant tweets')
            users = Parallel(n_jobs=num_jobs)(
                delayed(get_relevant_tweets)(user) for user in users)
            #users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users)
            user_to_rel_tweet = {
                user.user_id: user.features['relevant_tweets']
                for user in users if 'relevant_tweets' in user.features
            }
            with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile:
                pickle.dump(user_to_rel_tweet, tmpfile)
        else:
            with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile:
                user_to_rel_tweet = pickle.load(tmpfile)
            for user in users:
                if 'relevant_tweets' in user.features:
                    user.features['relevant_tweets'] = user_to_rel_tweet[
                        user.user_id]

        # Build credibility scores for all users on their topic
        print('Computing credibility')
        users = [prebuild_cred(model, u) for u in users]
        users_df = pd.DataFrame([vars(u) for u in users])

        [store_result(u) for u in users]
        with open('model_data/cred_pred_data', 'wb') as tmpfile:
            pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile)
    else:
        print('Loading users & model')
        with open('model_data/cred_pred_data', 'rb') as tmpfile:
            construct = pickle.load(tmpfile)
        users_df = construct['users']
        word_to_idx = construct['map']

    print('Making cred*sent predictions')
    X = []
    y = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x = cred_stance_prediction(this_users)
        this_y = facts['true'].iloc[idx]
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    print('Making cred*stance predictions')
    X = []
    y = []
    all_evidence = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        evidence = sorted(evidence, reverse=True, key=lambda x: x[0])
        # print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1])
        # print(evidence if len(evidence) <3 else evidence[:3])
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))
    print(X[:20])
    print(y[:20])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print(acc_scores)
    print(pr_scores)
    print(re_scores)
    print(f1_scores)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
Example #25
def text_processing(ques1, ques2):
    # from nltk.corpus.reader.wordnet import WordNetError
    from nltk.stem.porter import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    from nltk.corpus import wordnet as wn
    from nltk.corpus import stopwords
    import num2words as nw
    import string
    """Function to remove punctions in the strings"""
    r_p1 = list(
        map(
            lambda ques: ''.join(
                [word for word in ques1 if word not in string.punctuation]),
            [ques1]))
    r_p2 = list(
        map(
            lambda ques: ''.join(
                [word for word in ques2 if word not in string.punctuation]),
            [ques2]))
    """Function to create word token from the document"""
    w_t1 = list(
        map(
            lambda r_p: ' '.join([
                nw.num2words(word) if word.isdigit() else word
                for word in word_tokenize(r_p[0].replace("°", "").replace(
                    "²", ""))
            ]), [r_p1]))
    w_t2 = list(
        map(
            lambda r_p: ' '.join([
                nw.num2words(word) if word.isdigit() else word
                for word in word_tokenize(r_p[0].replace("°", "").replace(
                    "²", ""))
            ]), [r_p2]))
    l_w_t1 = len(word_tokenize(r_p1[0]))
    l_w_t2 = len(word_tokenize(r_p2[0]))
    """Function to remove stop words from the document"""
    wn.ensure_loaded()
    words = stopwords.words('english')
    r_s_w1 = list(
        map(
            lambda w_t: " ".join(
                [word for word in w_t[0].split() if word not in words]),
            [w_t1]))
    r_s_w2 = list(
        map(
            lambda w_t: " ".join(
                [word for word in w_t[0].split() if word not in words]),
            [w_t2]))
    l_r_s_w1 = len(word_tokenize(r_s_w1[0]))
    l_r_s_w2 = len(word_tokenize(r_s_w2[0]))
    """Function to stem tokens of string"""
    stemmer = PorterStemmer()
    stems1 = list(
        map(
            lambda r_s_w: " ".join(
                [stemmer.stem(word) for word in r_s_w[0].split(" ")]),
            [r_s_w1]))
    stems2 = list(
        map(
            lambda r_s_w: " ".join(
                [stemmer.stem(word) for word in r_s_w[0].split(" ")]),
            [r_s_w2]))
    """Function to lemmatize tokens of string"""
    lemmatizer = WordNetLemmatizer()
    lamit1 = list(
        map(
            lambda stems: " ".join(
                [lemmatizer.lemmatize(word) for word in stems[0].split()]),
            [stems1]))
    lamit2 = list(
        map(
            lambda stems: " ".join(
                [lemmatizer.lemmatize(word) for word in stems[0].split()]),
            [stems2]))

    # print([lamit1[0], lamit2[0]], [int(l_w_t1 / l_w_t2), int(l_r_s_w1 / l_r_s_w2)])
    return [lamit1[0],
            lamit2[0]], [int(l_w_t1 / l_w_t2),
                         int(l_r_s_w1 / l_r_s_w2)]
 def get_queryset(self):
     wordnet.ensure_loaded()
     # Wordnet incorporates a lazy corpus model, which starts loading
     # only on first call to itself, which can cause some issues with
     # multithreading
     return
outputFile = open("result_trigram.txt", "w+")

corpusLines = inputFile.readlines()
totalLines = len(corpusLines)
print(" Total Process : ", totalProcess)
if (totalLines < totalProcess):
    print(" Total needed process : ", totalLines)
    totalProcess = totalLines

allocationForProcess = int(totalLines / totalProcess)
lastAllocation = totalLines - allocationForProcess * totalProcess
print(" Total Lines : ", totalLines)
print(" Allocation for a Process : ", allocationForProcess)
print(" Last Allocation for a Process : ", lastAllocation)

wordnet.ensure_loaded()  # first access to wn transforms it

if (allocationForProcess > 0):
    # linePrinter.initCompletedstate(totalProcess+1)
    processes = [
        Thread(target=readByLines,
               args=(processId, corpusLines, processId * allocationForProcess,
                     (processId + 1) * allocationForProcess,
                     processId * allocationForProcess, True,
                     allocationForProcess))
        for processId in range(totalProcess)
    ]

    # Run processes
    for p in processes:
        p.start()
Example #28
    inputFile = open(basePath + inputFileName, "r")
    corpusLines = inputFile.readlines()
    totalLines = len(corpusLines)
    print(" Started File - ", inputFileName, " | Total Lines : ", totalLines)
    if not os.path.exists(outputPath + inputFileName):
        os.makedirs(outputPath + inputFileName)
    readByLines(corpusLines, totalLines, inputFileName)
    inputFile.close()
    return inputFileName


startTime = datetime.now()
print("Process started : ", startTime)
print("Total Files : ", TOTAL_FILES_FOR_READ)
FileNames = []
for i in range(0, TOTAL_FILES_FOR_READ):
    listToken = []
    if (i < 9):
        FileNames.append("news.en-0000" + str(i + 1) + "-of-00100")
    else:
        FileNames.append("news.en-000" + str(i + 1) + "-of-00100")

wordnet.ensure_loaded()
with concurrent.futures.ProcessPoolExecutor() as executor:
    for lemma, result in zip(FileNames, executor.map(doWorker, FileNames)):
        print(f"Finished for input file {lemma} was saved inside the {result}")

endTime = datetime.now()
print("\n\n Process Stopped : ", endTime)
print("\n\n Duration : ", endTime - startTime)
Example #29
import nltk
from nltk.corpus import wordnet


__all__ = (
    'get_related',
    'fix_determiners'
)


# NLTK seems to check for updates and tries to unzip the corpus even when it’s
# already installed, which slows down the import, so only invoke nltk.download()
# if wordnet isn’t already available.
try:
    wordnet.ensure_loaded()
except LookupError:
    try:
        nltk.download('wordnet')
    except OSError as exc:
        raise ImportError("Could not download WordNet:", exc)


_VOWELS = "aeiou"


def get_related(query, pos='n'):
    """If query is for a noun, return a random hyponym for query. If query is
    for an adjective, return a related adjective.

    Args:
Example #30
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    # Credibility data
    print('Loading users & model')
    with open('model_data/cred_pred_data', 'rb') as tmpfile:
        construct = pickle.load(tmpfile)
    users_df = construct['users']
    word_to_idx = construct['map']
    # Feature data
    with open('model_data/feature_data', 'rb') as tmpfile:
        fact_features = pickle.load(tmpfile)
    features = [
        'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size',
        'avg_len', 'avg_special_symbol', 'avg_time_retweet',
        'avg_count_distinct_words', 'avg_sent_pos', 'cred_pred',
        'cred_pred_std'
    ]

    print('Making cred*stance +best features predictions')
    facts['cred_pred'] = facts['hash'].map(
        lambda x: only_cred_support_deny_pred(users_df[users_df['fact'] == x]))
    facts['cred_pred_std'] = facts['cred_pred'].map(lambda x: np.std(x))
    facts['cred_pred'] = facts['cred_pred'].map(lambda x: x[-1])
    facts = facts.set_index('hash').join(fact_features.set_index('hash'),
                                         rsuffix='_other')
    X = facts[features].values
    y = facts['y'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
Example #31
    def run(self):

        out_dir = flags.cgmh_output_dir
        fout = open(os.path.join(out_dir, "log" + str(self.__idx) + ".log"),
                    "w")
        res_path = os.path.join(out_dir, "res" + str(self.__idx) + ".res")

        bb_atk_data = self.__bb_atk_data
        bb_atk_data_size = len(self.__bb_atk_data['raw'])
        bb_word2idx = self.__bb_word2idx
        bb_idx2word = self.__bb_idx2word
        vocab = self.__vocab
        bb = self.__bb
        m = self.__model
        bb_max_seqlen = self.__bb_max_seqlen
        sess = self.__sess
        negations = self.__negs

        op_prob = [
            flags.swp_prob, flags.ins_prob, flags.del_prob, flags.pass_prob
        ]
        op_prob = op_prob / numpy.sum(op_prob)
        n_sample = flags.sample_max_n
        n_candidate = flags.n_candidate
        just_acc_rate = flags.just_acc_rate
        swp_lm_threshold = flags.lm_swp_threshold
        ins_lm_threshold = flags.lm_ins_threshold
        del_lm_threshold = flags.lm_del_threshold
        swp_prob_threshold = flags.swp_threshold
        ins_prob_threshold = flags.ins_threshold
        del_prob_threshold = flags.del_threshold
        swn_obj_threshold = flags.senti_obj_threshold
        swn_pos_threshold = flags.senti_pos_threshold
        seq_min_len = flags.seq_min_len
        mode = flags.index_mode

        res_log = []
        sents = []
        idx = 0
        op = 3

        total_time = 0
        n_succ = 0

        lemmatzr = WordNetLemmatizer()

        for i in range(bb_atk_data_size):

            start_time = time.time()

            print("===== DATA %d/%d =====" % (i + 1, bb_atk_data_size),
                  file=fout,
                  flush=True)
            print("DATA %d/%d, id=%d" % (i + 1, bb_atk_data_size, self.__idx))
            flush()
            res_log.append([])
            raw = copy.deepcopy(bb_atk_data["raw"][i])
            raw = nltk.word_tokenize(raw.lower())
            l = len(raw) + 1
            if (l > flags.seq_max_len):
                l = flags.seq_max_len
            seq = [vocab.get_init_idx()]
            for ii in range(1, l):
                seq.append(vocab.get_vocab_idx(raw[ii - 1]))
            while len(seq) < flags.seq_max_len:
                seq.append(vocab.get_pad_idx())
            mask = [True]
            for ii in range(1, l):
                mask.append(False)

            bb_y = bb_atk_data["y"][i]
            bb_l = len(raw)
            if (bb_l > bb_max_seqlen):
                bb_l = bb_max_seqlen
            bb_seq = []
            for ii in range(bb_l):
                if raw[ii] in bb_word2idx.keys():
                    bb_seq.append(bb_word2idx[raw[ii]])
                else:
                    bb_seq.append(bb_word2idx["<unk>"])
            while len(bb_seq) < bb_max_seqlen:
                bb_seq.append(bb_word2idx["<pad>"])

            sents.append([])
            sample_cnt = 0
            sample_all = 0
            idx = 0
            sents[-1].append(copy.deepcopy(raw))
            print("%d/%d\tOriginal\tFAIL with %.5f" %
                  (i + 1, bb_atk_data_size, 1 - bb_atk_data["prob"][i]),
                  end="\n\t",
                  file=fout,
                  flush=True)
            for ii in range(len(raw)):
                print(raw[ii], end=" ", file=fout, flush=True)
            if bb_y == 1:
                print("\t<POS>", file=fout, flush=True)
            else:
                print("\t<NEG>", file=fout, flush=True)

            while sample_all < n_sample:

                try:

                    wn.ensure_loaded()

                    sample_all += 1
                    op = random_pick_idx_with_unnormalized_prob(op_prob)
                    succ = False
                    if op == 3:
                        tmp_prob = sess.run(bb.prob,
                                            feed_dict={
                                                bb.X: [bb_seq],
                                                bb.L: [bb_l]
                                            })[0][1 - bb_y]
                        if tmp_prob >= 0.5:
                            res_log[i].append((sample_all, 1))
                            print(
                                "%d/%d\t%d acc / %d all\tPASS\t SUCC with %.5f"
                                % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                   sample_all + 1, tmp_prob),
                                file=fout,
                                flush=True)
                            succ = True
                        else:
                            res_log[i].append((sample_all, 0))
                            print(
                                "%d/%d\t%d acc / %d all\tPASS\t FAIL with %.5f"
                                % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                   sample_all + 1, tmp_prob),
                                file=fout,
                                flush=True)
                        sample_cnt += 1
                        sents[-1].append(copy.deepcopy(raw))
                        print("", end="\t", file=fout, flush=True)
                        for ii in range(len(raw)):
                            print(raw[ii], end=" ", file=fout, flush=True)
                        if bb_y == 1:
                            print("\t<POS>", file=fout, flush=True)
                        else:
                            print("\t<NEG>", file=fout, flush=True)
                        if succ:
                            print("\tSUCC!")
                            flush()
                            break
                        continue

                    if mode == "random":
                        idx = random.randint(0, l - 1)
                    elif mode == "traverse":
                        idx = (idx + 1) % l
                    elif mode == "grad":
                        if op == 1:
                            idx = random.randint(0, l - 1)
                        else:
                            grad_vecs = sess.run(bb.embed_grad,
                                                 feed_dict={
                                                     bb.X: [bb_seq],
                                                     bb.L: [bb_l],
                                                     bb.Y: [1 - bb_y]
                                                 })[0][0]
                            grads = numpy.linalg.norm(grad_vecs, axis=-1)
                            candidate_grads = []
                            candidate_idxs = []
                            position_tag = nltk.pos_tag(raw)
                            for pos in range(len(position_tag)):
                                tmp_tag = get_part_of_speech(
                                    position_tag[pos][1])
                                if tmp_tag is None:
                                    candidate_grads.append(grads[pos])
                                    candidate_idxs.append(pos + 1)
                                    continue
                                tmp_wn = wn.synsets(lemmatzr.lemmatize(
                                    raw[pos]),
                                                    pos=tmp_tag)
                                if len(tmp_wn) <= 0:
                                    candidate_grads.append(grads[pos])
                                    candidate_idxs.append(pos + 1)
                                    continue
                                tmp_swn = swn.senti_synset(tmp_wn[0].name())
                                if (tmp_swn.obj_score() > swn_obj_threshold \
                                    or (tmp_swn.obj_score() <= swn_obj_threshold \
                                        and abs(tmp_swn.pos_score()-tmp_swn.neg_score()) <= swn_pos_threshold)):
                                    candidate_grads.append(grads[pos])
                                    candidate_idxs.append(pos + 1)
                                    continue
                            idx_idx = random_pick_idx_with_unnormalized_prob(
                                candidate_grads)
                            idx = candidate_idxs[idx_idx]
                    else:
                        assert False, "Invalid mode \"" + mode + "\""

                    old_wrong_prob = sess.run(bb.prob,
                                              feed_dict={
                                                  bb.X: [bb_seq],
                                                  bb.L: [bb_l]
                                              })[0][1 - bb_y]
                    if op == 0:
                        if mask[idx]:
                            continue
                        proposal = m.op_replace(sess, copy.deepcopy(seq), l,
                                                copy.deepcopy(bb_seq), bb_l,
                                                1 - bb_y, idx, n_candidate,
                                                op_prob)
                        tmp_bb_seq = copy.deepcopy(bb_seq)
                        tmp_str = vocab.get_vocab(proposal['proposal'][idx])
                        if tmp_str in bb_word2idx.keys():
                            tmp_bb_seq[idx - 1] = bb_word2idx[tmp_str]
                        else:
                            tmp_bb_seq[idx - 1] = bb_word2idx["<unk>"]
                        new_wrong_prob = sess.run(bb.prob,
                                                  feed_dict={
                                                      bb.X: [tmp_bb_seq],
                                                      bb.L: [bb_l]
                                                  })[0][1 - bb_y]
                        tmp_raw = copy.deepcopy(raw)
                        tmp_raw[idx - 1] = vocab.get_vocab(
                            proposal["proposal"][idx])
                        new_tag = get_part_of_speech(
                            nltk.pos_tag(tmp_raw)[idx - 1][1])
                        if new_tag is None:
                            new_obj = 1
                            new_pos = 0
                        else:
                            new_wn = wn.synsets(lemmatzr.lemmatize(
                                tmp_raw[idx - 1]),
                                                pos=new_tag)
                            if len(new_wn) <= 0:
                                new_obj = 1
                                new_pos = 0
                            else:
                                new_swn = swn.senti_synset(new_wn[0].name())
                                new_obj = new_swn.obj_score()
                                new_pos = new_swn.pos_score(
                                ) - new_swn.neg_score()
                        if (just_acc(just_acc_rate)
                            or (numpy.random.uniform(0,1) <= \
                                proposal["alpha"] * new_wrong_prob / old_wrong_prob
                            and proposal["old_prob"] * swp_lm_threshold <= proposal["new_prob"]
                            and old_wrong_prob * swp_prob_threshold <= new_wrong_prob
                            and (new_obj > swn_obj_threshold        # objective
                                      or (new_obj <= swn_obj_threshold    # neutral
                                    and abs(new_pos) <= swn_pos_threshold))
                            and (tmp_str not in negations))):
                            if new_wrong_prob >= 0.5:
                                res_log[i].append((sample_all, 1))
                                print(
                                    "%d/%d\t%d acc / %d all\tSWP\t SUCC with %.5f\t[%s](%d) => [%s](%d) (%d)"
                                    % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                       sample_all, new_wrong_prob,
                                       vocab.get_vocab(seq[idx]), seq[idx],
                                       vocab.get_vocab(
                                           proposal["proposal"][idx]),
                                       proposal["proposal"][idx], idx),
                                    file=fout,
                                    flush=True)
                                succ = True
                            else:
                                res_log[i].append((sample_all, 0))
                                print(
                                    "%d/%d\t%d acc / %d all\tSWP\t FAIL with %.5f\t[%s](%d) => [%s](%d) (%d)"
                                    % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                       sample_all, new_wrong_prob,
                                       vocab.get_vocab(seq[idx]), seq[idx],
                                       vocab.get_vocab(
                                           proposal["proposal"][idx]),
                                       proposal["proposal"][idx], idx),
                                    file=fout,
                                    flush=True)
                            sample_cnt += 1
                            seq = proposal["proposal"]
                            bb_seq = tmp_bb_seq
                            raw = tmp_raw
                            sents[-1].append(copy.deepcopy(raw))
                            print("", end="\t", file=fout, flush=True)
                            for ii in range(len(raw)):
                                print(raw[ii], end=" ", file=fout, flush=True)
                            if bb_y == 1:
                                print("\t<POS>", file=fout, flush=True)
                            else:
                                print("\t<NEG>", file=fout, flush=True)
                        else:
                            print("%d/%d\t%d acc / %d all\tSWP\talpha %.2e" %
                                  (i + 1, bb_atk_data_size, sample_cnt,
                                   sample_all, proposal["alpha"]),
                                  file=fout,
                                  flush=True)

                    elif op == 1:
                        if idx == l - 1:
                            continue
                        proposal = m.op_insert(sess, copy.deepcopy(seq), l,
                                               copy.deepcopy(bb_seq), bb_l,
                                               1 - bb_y, idx, n_candidate,
                                               op_prob)
                        tmp_bb_seq = numpy.asarray(
                            copy.deepcopy(bb_seq)).tolist()
                        tmp_str = vocab.get_vocab(proposal['proposal'][idx +
                                                                       1])
                        if tmp_str in bb_word2idx.keys():
                            tmp_bb_seq = tmp_bb_seq[:idx] + [
                                bb_word2idx[tmp_str]
                            ] + tmp_bb_seq[idx:]
                        else:
                            tmp_bb_seq = tmp_bb_seq[:idx] + [
                                bb_word2idx["<unk>"]
                            ] + tmp_bb_seq[idx:]
                        tmp_bb_seq = tmp_bb_seq[:-1]
                        tmp_bb_l = bb_l + 1
                        if tmp_bb_l > bb_max_seqlen:
                            tmp_bb_l = bb_max_seqlen
                        new_wrong_prob = sess.run(bb.prob,
                                                  feed_dict={
                                                      bb.X: [tmp_bb_seq],
                                                      bb.L: [tmp_bb_l]
                                                  })[0][1 - bb_y]
                        tmp_raw = copy.deepcopy(raw)
                        tmp_raw = tmp_raw[:idx] + [tmp_str] + tmp_raw[idx:]
                        new_tag = get_part_of_speech(
                            nltk.pos_tag(tmp_raw)[idx][1])
                        if new_tag is None:
                            new_obj = 1
                            new_pos = 0
                        else:
                            new_wn = wn.synsets(lemmatzr.lemmatize(
                                tmp_raw[idx]),
                                                pos=new_tag)
                            if len(new_wn) <= 0:
                                new_obj = 1
                                new_pos = 0
                            else:
                                new_swn = swn.senti_synset(new_wn[0].name())
                                new_obj = new_swn.obj_score()
                                new_pos = new_swn.pos_score(
                                ) - new_swn.neg_score()
                        if (just_acc(just_acc_rate)
                            or (numpy.random.uniform(0,1) <= \
                                proposal["alpha"] * new_wrong_prob / old_wrong_prob
                            and proposal["old_prob"] * ins_lm_threshold <= proposal["new_prob"]
                            and old_wrong_prob * ins_prob_threshold <= new_wrong_prob
                            and (new_obj > swn_obj_threshold        # objective
                                      or (new_obj <= swn_obj_threshold   # neutral
                                     and new_pos <= swn_pos_threshold))
                            and (tmp_str not in negations))):
                            if new_wrong_prob >= 0.5:
                                res_log[i].append((sample_all, 1))
                                print(
                                    "%d/%d\t%d acc / %d all\tINS\t SUCC with %.5f\t[] => [%s](%d,%.1f,%.1f) (%d)"
                                    % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                       sample_all, new_wrong_prob,
                                       vocab.get_vocab(
                                           proposal["proposal"][idx + 1]),
                                       proposal["proposal"][idx + 1], new_obj,
                                       new_pos, idx),
                                    file=fout,
                                    flush=True)
                                succ = True
                            else:
                                res_log[i].append((sample_all, 0))
                                print(
                                    "%d/%d\t%d acc / %d all\tINS\t FAIL with %.5f\t[] => [%s](%d,%.1f,%.1f) (%d)"
                                    % (i + 1, bb_atk_data_size, sample_cnt + 1,
                                       sample_all, new_wrong_prob,
                                       vocab.get_vocab(
                                           proposal["proposal"][idx + 1]),
                                       proposal["proposal"][idx + 1], new_obj,
                                       new_pos, idx),
                                    file=fout,
                                    flush=True)
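                            # Commit the accepted proposal: update the sequence, its
                            # black-box mirror, the length, the mask, and the raw tokens.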
                            sample_cnt += 1
                            seq = proposal["proposal"]
                            bb_seq = tmp_bb_seq
                            l += 1
                            mask = mask[:idx + 1] + [False] + mask[idx + 1:]
                            if l > flags.seq_max_len:
                                l = flags.seq_max_len
                            mask = mask[:l]
                            bb_l = tmp_bb_l
                            raw = raw[:idx] + [vocab.get_vocab(seq[idx + 1])
                                               ] + raw[idx:]
                            sents[-1].append(copy.deepcopy(raw))
                            print("", end="\t", file=fout, flush=True)
                            for ii in range(len(raw)):
                                print(raw[ii], end=" ", file=fout, flush=True)
                            if bb_y == 1:
                                print("\t<POS>", file=fout, flush=True)
                            else:
                                print("\t<NEG>", file=fout, flush=True)
                        else:
                            print("%d/%d\t%d acc / %d all\tINS\talpha %.2e" %
                                  (i + 1, bb_atk_data_size, sample_cnt,
                                   sample_all, proposal["alpha"]),
                                  file=fout,
                                  flush=True)

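                    # op == 2: propose deleting the word at position idx; protected
                    # (masked) positions are skipped and the sequence is never shortened
                    # below seq_min_len.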
                    elif op == 2:
                        if mask[idx] or l - 1 < seq_min_len:
                            continue
                        proposal = m.op_delete(sess, copy.deepcopy(seq), l,
                                               copy.deepcopy(bb_seq), bb_l,
                                               1 - bb_y, idx, n_candidate,
                                               op_prob)
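                        # Mirror the deletion in the black-box ids and pad the tail so
                        # the input keeps its fixed length.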
                        tmp_bb_seq = numpy.asarray(
                            copy.deepcopy(bb_seq)).tolist()
                        tmp_str = vocab.get_vocab(seq[idx])
                        tmp_bb_seq = tmp_bb_seq[:idx -
                                                1] + tmp_bb_seq[idx:] + [
                                                    bb_word2idx['<pad>']
                                                ]
                        tmp_bb_l = bb_l - 1
                        new_wrong_prob = sess.run(bb.prob,
                                                  feed_dict={
                                                      bb.X: [tmp_bb_seq],
                                                      bb.L: [tmp_bb_l]
                                                  })[0][1 - bb_y]
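                        # Accept the deletion under the same ratio and threshold checks,
                        # provided the removed word is not a negation.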
                        if (just_acc(just_acc_rate)
                                or (numpy.random.uniform(0, 1) <=
                                    proposal["alpha"] * new_wrong_prob / old_wrong_prob
                                    and proposal["old_prob"] * del_lm_threshold <= proposal["new_prob"]
                                    and old_wrong_prob * del_prob_threshold <= new_wrong_prob)
                                and (tmp_str not in negations)):
                            if new_wrong_prob >= 0.5:
                                res_log[i].append((sample_all, 1))
                                print(
                                    "%d/%d\t%d acc / %d all\tDEL\t SUCC with %.5f\t[%s](%d) => [] (%d)"
                                    %
                                    (i + 1, bb_atk_data_size, sample_cnt + 1,
                                     sample_all, new_wrong_prob,
                                     vocab.get_vocab(seq[idx]), seq[idx], idx),
                                    file=fout,
                                    flush=True)
                                succ = True
                            else:
                                res_log[i].append((sample_all, 0))
                                print(
                                    "%d/%d\t%d acc / %d all\tDEL\t FAIL with %.5f\t[%s](%d) => [] (%d)"
                                    %
                                    (i + 1, bb_atk_data_size, sample_cnt + 1,
                                     sample_all, new_wrong_prob,
                                     vocab.get_vocab(seq[idx]), seq[idx], idx),
                                    file=fout,
                                    flush=True)
                            sample_cnt += 1
                            seq = proposal["proposal"]
                            bb_seq = tmp_bb_seq
                            l -= 1
                            mask = mask[:idx] + mask[idx + 1:]
                            bb_l = tmp_bb_l
                            raw = raw[:idx - 1] + raw[idx:]
                            sents[-1].append(copy.deepcopy(raw))
                            print("", end="\t", file=fout, flush=True)
                            for ii in range(len(raw)):
                                print(raw[ii], end=" ", file=fout, flush=True)
                            if bb_y == 1:
                                print("\t<POS>", file=fout, flush=True)
                            else:
                                print("\t<NEG>", file=fout, flush=True)
                        else:
                            print("%d/%d\t%d acc / %d all\tDEL\talpha %.2e" %
                                  (i + 1, bb_atk_data_size, sample_cnt,
                                   sample_all, proposal["alpha"]),
                                  file=fout,
                                  flush=True)

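                    # Track timing statistics whenever an adversarial sample succeeds.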
                    if succ:
                        end_time = time.time()
                        total_time += end_time - start_time
                        n_succ += 1
                        print("\tSUCC!")
                        print("\t\ttime =", total_time, n_succ)
                        flush()

                    assert len(mask) == l

                except Exception as e:

                    print("Something went wrong... Abort!",
                          file=fout,
                          flush=True)
                    print("Something went wrong... Abort! -- Thread %d" %
                          self.__idx)
                    print("\t", e)
                    sys.stdout.flush()
                    sys.stderr.flush()
                    continue

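            # Dump the per-example attack log and the generated sentences to res_path.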
            with open(res_path, "wb") as f:
                pkl.dump((res_log, sents), f)
Example #32
0
"""
Unit tests for nltk.corpus.wordnet
See also nltk/test/wordnet.doctest
"""

from __future__ import unicode_literals
from nose import SkipTest
import unittest
import os

from nltk.corpus.reader.wordnet import WordNetCorpusReader
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wnic
from nltk.data import find as find_data


wn.ensure_loaded()
S = wn.synset
L = wn.lemma

class WordNetDemo(unittest.TestCase):

    def test_retrieve_synset(self):
        move_synset = S('go.v.21')
        self.assertEqual(move_synset.name(), "move.v.15")
        self.assertEqual(move_synset.lemma_names(), ['move', 'go'])
        self.assertEqual(move_synset.definition(), "have a turn; make one's move in a game")
        self.assertEqual(move_synset.examples(), ['Can I go now?'])


    def test_retrieve_synsets(self):
        self.assertEqual(sorted(wn.synsets('zap', pos='n')),