Example #1
    def deserialize(self, type, name, language='en'):
    
        serializer = self.serializers[type]
        
        if type != "lda_model":
            with codecs.open(name, "r", encoding = "utf-8") as f:
                data = json.load(f)
         
        elif type == "lda_model":
            with open(name, "rb") as f:
                data = pickle.load(f)
            
        deserialized = serializer(data).deserialize()
        
        if type == "phrases":
            if language == 'en':
                common_terms = self.function_words_single
            else:
                common_terms = safe_get_stop_words(language)

            phrases = Phrases(delimiter="_", connector_words=common_terms)
            phrases.phrasegrams = deserialized
            deserialized = phrases        
        
        return deserialized
Example #2
 def remove_stop_words(string, language):
     tokens = string.split()
     clean_tokens = [
         token for token in tokens
         if token not in safe_get_stop_words(language)
     ]
     return u' '.join(clean_tokens)
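A minimal standalone sketch of the same filtering step, assuming only the stop_words package is installed; the exact output depends on the package's English stop word list.

from stop_words import safe_get_stop_words

stops = safe_get_stop_words('en')  # plain list of stop words; [] for unknown languages
sentence = u"this is only a small example"
print(u' '.join(t for t in sentence.split() if t not in stops))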
Example #3
def get_document_json(url):
    """
    Parameters
    -------------
    url: str
        url of the document to be parsed.
    Returns
    -------------
    dict: document data.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    if article.publish_date is None or isinstance(article.publish_date, str):
        date = None
    else:
        date = article.publish_date.strftime('%Y-%m-%d')
    if article.meta_lang != None and article.meta_lang != '':
        stopwords = safe_get_stop_words(article.meta_lang)
        keywords = [i for i in article.keywords if i not in stopwords]
    else:
        keywords = article.keywords
    keywords = list(set([slugify(i) for i in keywords]))
    json = {
        'title': article.title,
        'authors': article.authors,
        'created_on': date,
        'language': article.meta_lang,
        'keywords': keywords,
        'url': url,
    }
    return json
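A hedged usage sketch for the function above. It assumes the newspaper3k, python-slugify and stop_words packages plus network access; the URL is a placeholder, not a real document.

# assumes get_document_json from the example above is in scope
doc = get_document_json("https://example.com/some-article")
print(doc['title'], doc['language'], doc['keywords'][:5])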
Example #4
def make_word_cloud(df, ngram_min, ngram_max, name):
    my_path = os.path.abspath(os.path.dirname(__file__)) + '/static/wordcloud/'
    stop_words = safe_get_stop_words('en')
    filenames = []
    for ind, row in df.iterrows():
        data = row['review_text']
        num_words = 200
        ngram_range = (ngram_min, ngram_max)
        count_vectorizer = CountVectorizer(lowercase=True,
                                           stop_words=stop_words,
                                           ngram_range=ngram_range)
        counts = count_vectorizer.fit_transform(data)
        counts = counts.toarray().sum(axis=0)
        count_weighting = dict(
            zip(count_vectorizer.get_feature_names(), counts))
        count_weighting_df = pd.DataFrame.from_dict(count_weighting,
                                                    orient='index')
        count_weighting_df = count_weighting_df.reset_index(drop=False)
        count_weighting_df.columns = ['word', 'count']

        count_weighting_df = count_weighting_df.sort_values(['count'],
                                                            ascending=False)
        count_weighting_df = count_weighting_df.set_index('word')

        word_cloud_freq = count_weighting_df['count'].head(num_words).to_dict()
        wordcloud = WordCloud(
            collocations=False).generate_from_frequencies(word_cloud_freq)
        plotname = '{}_{}.png'.format(name, ind + 1)
        filenames.append(plotname)
        url = my_path + plotname
        fig = plt.figure(figsize=(10, 10))
        plt.imshow(wordcloud, cmap=plt.cm.bone, interpolation='bilinear')
        plt.axis("off")
        fig.savefig(url, transparent=True, bbox_inches='tight', pad_inches=0)
    return filenames
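A hedged usage sketch. Note three assumptions: each row's review_text must itself be a collection of documents (CountVectorizer.fit_transform expects an iterable of texts, not a single string), the static/wordcloud/ directory next to the module must already exist, and on scikit-learn 1.2+ the get_feature_names() call above would need to be get_feature_names_out().

import pandas as pd

# assumes make_word_cloud from the example above is in scope
df = pd.DataFrame({'review_text': [
    ["great product, works well", "would buy again, great value"],
    ["poor quality, stopped working", "not worth the money"],
]})
print(make_word_cloud(df, ngram_min=1, ngram_max=2, name="demo"))
# e.g. ['demo_1.png', 'demo_2.png']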
Example #5
 def test_random_language_stop_words_load(self):
     languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
     sample = random.sample(languages, len(languages))
     for language in sample:
         stop_words = safe_get_stop_words(language)
         self.assertTrue(
             len(stop_words) > 0,
             'Cannot load stopwords for {0} language'.format(language))
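The constants exercised by this test come from the stop_words package itself; a short hedged sketch, assuming LANGUAGE_MAPPING maps short codes ('en', ...) onto the full names listed in AVAILABLE_LANGUAGES ('english', ...).

from stop_words import AVAILABLE_LANGUAGES, LANGUAGE_MAPPING, safe_get_stop_words

# both spellings should resolve to the same stop word list
print(len(safe_get_stop_words('en')) == len(safe_get_stop_words('english')))
print(sorted(LANGUAGE_MAPPING)[:3], sorted(AVAILABLE_LANGUAGES)[:3])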
Example #6
 def test_random_language_stop_words_load(self):
     languages = list(LANGUAGE_MAPPING.keys()) + list(AVAILABLE_LANGUAGES)
     sample = random.sample(languages, len(languages))
     for language in sample:
         stop_words = safe_get_stop_words(language)
         self.assertTrue(
             len(stop_words) > 0,
             'Cannot load stopwords for {0} language'.format(language)
         )
Example #7
def suitable_complex_word(w):
    """Checks if detected word is suitable for replacing."""
    # Not stopword or punctuation.
    not_stopword = w not in safe_get_stop_words(config.lang) and w.isalpha()
    # Not a simple word (above defined threshold).
    not_simple = zfreq(w, config.lang) < config.min_complexity
    # No uppercase (ensures NEs are not simplified).
    not_uppercase = w.islower()

    return not_stopword and not_simple and not_uppercase
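The snippet relies on two module-level names that are not shown: config (project settings) and zfreq (a word-frequency lookup). A hedged sketch of how it might be driven, assuming zfreq wraps wordfreq.zipf_frequency and config is a plain namespace; both are stand-ins, not the original project's objects.

from types import SimpleNamespace
from wordfreq import zipf_frequency as zfreq  # assumed frequency backend

config = SimpleNamespace(lang='en', min_complexity=3.5)  # hypothetical settings

# with suitable_complex_word from the example above in scope:
print(suitable_complex_word("perspicacious"))  # likely True: rare, lowercase, not a stop word
print(suitable_complex_word("the"))            # False: stop word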
Example #8
def get_stopwords_by_lang(language):
    while True:
        try:
            stopwords = set(nltk.corpus.stopwords.words(language))
            return stopwords
        except LookupError as error:
            resource = re.search("nltk\\.download\\('(.+?)'\\)",
                                 str(error)).group(1)
            print(f'Downloading missing resource [{resource}]')
            nltk.download(resource)
        except IOError as error:
            stopwords = stop_words.safe_get_stop_words(language)
            return stopwords
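Brief usage note: the loop retries after downloading whichever NLTK resource the LookupError names, and falls back to the stop_words package if NLTK's data cannot be read. The call below assumes nltk, re and stop_words are imported at module level, as the function expects.

# the first call may trigger nltk.download('stopwords'); later calls hit the local copy
print(len(get_stopwords_by_lang('english')) > 0)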
Example #9
def assemble_stopwords(languages = ['english'], user_defined = []):
    '''
    Supported Languages
        Arabic Catalan Danish Dutch
        English Finnish French German
        Hungarian Italian Norwegian Portuguese
        Romanian Russian Spanish Swedish
        Turkish Ukrainian
    '''
    sw = []
    if len(user_defined) > 0:
        sw += user_defined
    for i in languages:
        sw += safe_get_stop_words(i)
    return set(sw)
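Example call, assuming the stop_words package is installed and the language names match the supported list in the docstring; 'etc' and 'eg' stand in for project-specific extra terms.

sw = assemble_stopwords(languages=['english', 'german'], user_defined=['etc', 'eg'])
print('the' in sw, 'und' in sw, 'etc' in sw)  # expected: True True True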
Example #10
def remove_stopwords(text, text_language):
    tweet_words = text.lower().split()

    stop_words = safe_get_stop_words(
        languages.get(part1=text_language).name.lower()) + [
            "ul", "b", "v", "a", "z", "li", "o", "s", "k", "i", "se", "u003e",
            "u003c", "u", "href", "u003cli", "u003ca", "u003cul", "u003cb",
            "httpslinkcomdate", "u003edalší", "n"
        ]

    words = ""
    for word in tweet_words:
        if word not in stop_words:
            words = words + (" " + word)

    return words
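A hedged sketch of the language lookup above, assuming languages comes from the iso-639 package: its part1 codes map to full English language names, which the stop_words package also accepts.

from iso639 import languages
from stop_words import safe_get_stop_words

name = languages.get(part1='cs').name.lower()  # 'czech'
print(name, len(safe_get_stop_words(name)) > 0)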
Example #11
    def _build_phrases(self, df, min_count = 1, language='en'):

        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)

        phrases = Phrases(
            sentences=stream_clean(df),
            min_count=min_count,
            threshold=0.70,
            scoring="npmi",
            max_vocab_size=20000000,
            delimiter="_",
            connector_words=common_terms
        )
        
        self.phrases = phrases
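A standalone sketch of the same Phrases configuration on gensim >= 4.0, where connector_words keeps stop words from blocking candidate phrases; the tiny token stream below stands in for the project's stream_clean(df) generator, which is not shown.

from gensim.models.phrases import Phrases
from stop_words import safe_get_stop_words

sentences = [
    ["new", "york", "is", "a", "big", "city"],
    ["she", "moved", "to", "new", "york", "last", "year"],
    ["new", "york", "has", "busy", "streets"],
]
phrases = Phrases(sentences, min_count=1, threshold=0.5, scoring="npmi",
                  delimiter="_", connector_words=safe_get_stop_words('en'))
print(phrases[["i", "love", "new", "york"]])  # likely ['i', 'love', 'new_york']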
Example #12
    def extract_dictionary_feature(self, check_str):
        """
        TODO
        * check that the string is actually a word (may not be necessary with different tokenisation)

        Example:
            >>> tests = [u'Hom.',u'Homér']
            >>> fe = FeatureExtractor()
            >>> [(tests[n],fe.feat_labels[fe.extract_dictionary_feature(t)[1]]) for n,t in enumerate(tests)]

        """
        feature_name = "n_works_dictionary"

        # compile a list of stopwords for all relevant languages
        languages = ["it", "de", "fr", "en", "es"]
        stopwords = []
        for lang in languages:
            stopwords += safe_get_stop_words(lang)

        if len(check_str) <= 2 or check_str.lower() in stopwords:
            # don't output dictionary feature for stopwords!
            return (feature_name, self.OTHERS)

        match_works = self.works_dict.lookup(check_str.encode("utf-8"))
        match_authors = self.authors_dict.lookup(check_str.encode("utf-8"))
        #result = (feature_name, self.OTHERS)

        if (len(match_authors) > 0):
            for key in match_authors:
                if (len(match_authors[key]) == len(check_str)):
                    result = (feature_name, self.MATCH_AUTHORS_DICT)
                else:
                    result = (feature_name, self.CONTAINED_AUTHORS_DICT)
        elif (len(match_works) > 0):
            for key in match_works:
                if (len(match_works[key]) == len(check_str)):
                    result = (feature_name, self.MATCH_WORKS_DICT)
                else:
                    result = (feature_name, self.CONTAINED_WORKS_DICT)
        else:
            result = (feature_name, self.OTHERS)
        return result
Example #13
    def extract_dictionary_feature(self, check_str):
        """
        TODO
        * check that the string is actually a word (may not be necessary with different tokenisation)

        Example:
            >>> tests = [u'Hom.',u'Homér']
            >>> fe = FeatureExtractor()
            >>> [(tests[n],fe.feat_labels[fe.extract_dictionary_feature(t)[1]]) for n,t in enumerate(tests)]

        """
        feature_name = "n_works_dictionary"

        # compile a list of stopwords for all relevant languages
        languages = ["it", "de", "fr", "en", "es"]
        stopwords = []
        for lang in languages:
            stopwords += safe_get_stop_words(lang)

        if len(check_str) <= 2 or check_str.lower() in stopwords:
            # don't output dictionary feature for stopwords!
            return (feature_name, self.OTHERS)

        match_works = self.works_dict.lookup(check_str.encode("utf-8"))
        match_authors = self.authors_dict.lookup(check_str.encode("utf-8"))
        #result = (feature_name, self.OTHERS)

        if(len(match_authors) > 0):
            for key in match_authors:
                if(len(match_authors[key]) == len(check_str)):
                    result = (feature_name, self.MATCH_AUTHORS_DICT)
                else:
                    result = (feature_name, self.CONTAINED_AUTHORS_DICT)
        elif(len(match_works) > 0):
            for key in match_works:
                if(len(match_works[key]) == len(check_str)):
                    result = (feature_name, self.MATCH_WORKS_DICT)
                else:
                    result = (feature_name, self.CONTAINED_WORKS_DICT)
        else:
            result = (feature_name, self.OTHERS)
        return result
Example #14
def suitable_candidate(w, c):
    """Checks if candidate is a suitable substitute based on
    various criteria."""
    source_stem = stemmer.stem(w)
    candidate_stem = stemmer.stem(c)

    # Check stem length.
    not_stem_len = not (len(candidate_stem) >= 3
                        and candidate_stem[:3] == source_stem[:3])
    # Not sharing stem with original word.
    not_equal_stem = source_stem != candidate_stem
    # Not punctuation
    not_punctuation = c.isalpha()

    # Other checks (disable when benchmarking).
    not_morph_deriv = c not in w and w not in c
    not_complex = zfreq(c, config.lang) > zfreq(w, config.lang)
    not_stopword = c not in safe_get_stop_words(config.lang) and c.isalpha()

    return not_equal_stem and not_stem_len and not_morph_deriv and not_stopword and not_complex
Example #15
def get_document_json(post):
    """
    Parameters
    -------------
    post: dict
        post data.
    Returns
    -------------
    dict: document data.
    """
    try:
        article = Article(post['url'])
        article.download()
        article.parse()
        article.nlp()
        if article.publish_date is None or isinstance(article.publish_date, str):
            date = None
        else:
            date = article.publish_date.strftime('%Y-%m-%d')
        if article.meta_lang != None and article.meta_lang != '':
            stopwords = safe_get_stop_words(article.meta_lang)
            keywords = [i for i in article.keywords if i not in stopwords]
        else:
            keywords = article.keywords
        keywords = list(set([slugify(i) for i in keywords]))
        json = {
            'title': article.title,
            'authors': article.authors,
            'created_on': date,
            'language': article.meta_lang,
            'keywords': keywords,
            'url': post['url'],
        }
        if article.has_top_image() and post['image'] == MISSING_IMAGE:
            post['image'] = article.top_image
    except ArticleException:
        json = {
            'url': post['url']
        }
    return json
Example #16
    def get_association(self, df, min_count=1, threshold=0.70, save_phraser=False, language='en'):
    
        cxg = C2xG(language = self.settings.MAP_THREE[language])
        association_df = cxg.get_association(self.read(df), freq_threshold = min_count, smoothing = False, lex_only = True)
        
        if save_phraser == True:
            if language == 'en':
                common_terms = self.function_words_single
            else:
                common_terms = safe_get_stop_words(language)

            phrasegrams = {}
            for row in association_df.itertuples():
                word = row[1] + "_" + row[2]
                if row[3] > threshold:
                    phrasegrams[word] = row[3]
        
            phrases = Phrases(delimiter="_", connector_words=common_terms, min_count=min_count, threshold=threshold)
            phrases.phrasegrams = phrasegrams
            self.phrases = phrases
            
        return association_df
Example #17
def senti_values_csv(min_words):
    twitter_data = import_csv("ItalianTweets.csv")
    tweet_list = get_column(twitter_data, 6)
    tweet_sentiment = get_column(twitter_data, 2)
    size = len(tweet_list)
    for tweet in range(0, size):
        tweet_list[tweet] = del_twitter_words(tweet_list[tweet])
        tweet_list[tweet] = del_characters(tweet_list[tweet], '"!@#$%^&*()_-+=1234567890?<>|[]{}\\/')

    # senti_dict = {}
    stopwords = stop_words.safe_get_stop_words('it')
    senti_words = create_list(tweet_list)
    senti_words = remove_duplicates(stopwords, senti_words, min_words)
    # print(len(senti_words))
    senti_words = translations(senti_words)
    senti_list = get_senti_values(senti_words)

    with open('Senti_Values.csv', 'wt') as senti_file:
        wr = csv.writer(senti_file, lineterminator = '\n', quoting=csv.QUOTE_ALL)
        wr.writerow(['Word', 'Pos', 'Neg'])
        length = len(senti_list)
        for pos in range(0, length):
            wr.writerow(senti_list[pos])
Example #18
 def test_safe_get_stop_words(self):
     self.assertRaises(StopWordError, get_stop_words, 'huttese')
     self.assertEqual(safe_get_stop_words('huttese'), [])
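The behaviour this test checks can be reproduced directly; a short sketch, assuming the stop_words package exposes get_stop_words, safe_get_stop_words and StopWordError as used in the test.

from stop_words import StopWordError, get_stop_words, safe_get_stop_words

print(safe_get_stop_words('huttese'))  # [] - the safe_ variant swallows the error
try:
    get_stop_words('huttese')
except StopWordError as exc:
    print('raised StopWordError:', exc)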
Example #19
def main(pieces, strength, min_words):
    twitter_data = import_csv("ItalianTweets.csv")
    tweet_list = get_column(twitter_data, 6)
    tweet_sentiment = get_column(twitter_data, 2)
    size = len(tweet_list)
    for tweet in range(0, size):
        tweet_list[tweet] = del_twitter_words(tweet_list[tweet])
        tweet_list[tweet] = del_characters(tweet_list[tweet], '"!@#$%^&*()_-+=1234567890?<>|[]{}\\/')

    # senti_dict = {}
    stopwords = stop_words.safe_get_stop_words('it')
    senti_words = create_list(tweet_list)
    senti_words = remove_duplicates(stopwords, senti_words, min_words)
    senti_words = translations(senti_words)
    senti_dict = get_senti_values(senti_words)

    # pieces = 5
    train_and_test_data = randomize_tweets(tweet_list, tweet_sentiment, pieces)

    # total_data = [(1*strength) for x in range(width)]

    results = open('results.txt', 'w')
    results.write('Pieces = ' + str(pieces) + '\n')
    results.write('Min Words = ' + str(min_words) + '\n\n\n')
    results.close()

    for value in range(5, strength+1):
        final_train_results = [[] for x in range(5)]
        final_nb_results = [[] for x in range(5)]
        final_senti_results = [[] for x in range(5)]

        for piece in range(0, pieces):
            print(piece)
            train_data = []
            train_labels = []
            test_data = train_and_test_data[piece*2]
            test_labels = train_and_test_data[piece*2+1]
            for y in range(0,pieces):
                if piece != y:
                    train_data.extend(train_and_test_data[y*2])
                    train_labels.extend(train_and_test_data[y*2+1])
            train_words = create_list(train_data)
            train_words = remove_duplicates(stopwords, train_words, min_words)
            results = run_naive_bayes(train_data, train_labels, test_data, test_labels, train_words, stopwords, senti_dict, value)

            for x in range(0,5):
                final_train_results[x].append(results[0][x])
                final_nb_results[x].append(results[1][x])
                final_senti_results[x].append(results[2][x])

        accuracy = [0,0,0,0]
        positive_precision = [0,0,0,0]
        negative_precision = [0,0,0,0]
        positive_recall = [0,0,0,0]
        negative_recall = [0,0,0,0]

        for x in range(0,pieces):
            accuracy[0] += final_nb_results[0][x]
            accuracy[2] += final_senti_results[0][x]
            positive_precision[0] += final_nb_results[1][x]
            positive_precision[2] += final_senti_results[1][x]
            negative_precision[0] += final_nb_results[2][x]
            negative_precision[2] += final_senti_results[2][x]
            positive_recall[0] += final_nb_results[3][x]
            positive_recall[2] += final_senti_results[3][x]
            negative_recall[0] += final_nb_results[4][x]
            negative_recall[2] += final_senti_results[4][x]

        accuracy[0] /= float(pieces)
        accuracy[2] /= float(pieces)
        positive_precision[0] /= float(pieces)
        positive_precision[2] /= float(pieces)
        negative_precision[0] /= float(pieces)
        negative_precision[2] /= float(pieces)
        positive_recall[0] /= float(pieces)
        positive_recall[2] /= float(pieces)
        negative_recall[0] /= float(pieces)
        negative_recall[2] /= float(pieces)

        for y in range(0,pieces):
            accuracy[1] += (accuracy[0] - final_nb_results[0][y]) ** 2
            accuracy[3] += (accuracy[2] - final_senti_results[0][y]) ** 2
            positive_precision[1] += (positive_precision[0] - final_nb_results[1][y]) ** 2
            positive_precision[3] += (positive_precision[2] - final_senti_results[1][y]) ** 2
            negative_precision[1] += (negative_precision[0] - final_nb_results[2][y]) ** 2
            negative_precision[3] += (negative_precision[2] - final_senti_results[2][y]) ** 2
            positive_recall[1] += (positive_recall[0] - final_nb_results[3][y]) ** 2
            positive_recall[3] += (positive_recall[2] - final_senti_results[3][y]) ** 2
            negative_recall[1] += (negative_recall[0] - final_nb_results[4][y]) ** 2
            negative_recall[3] += (negative_recall[2] - final_senti_results[4][y]) ** 2

        accuracy[1] = (accuracy[1]/float(pieces)) ** 0.5
        accuracy[3] = (accuracy[3]/float(pieces)) ** 0.5
        positive_precision[1] = (positive_precision[1]/float(pieces)) ** 0.5
        positive_precision[3] = (positive_precision[3]/float(pieces)) ** 0.5
        negative_precision[1] = (negative_precision[1]/float(pieces)) ** 0.5
        negative_precision[3] = (negative_precision[3]/float(pieces)) ** 0.5
        positive_recall[1] = (positive_recall[1]/float(pieces)) ** 0.5
        positive_recall[3] = (positive_recall[3]/float(pieces)) ** 0.5
        negative_recall[1] = (negative_recall[1]/float(pieces)) ** 0.5
        negative_recall[3] = (negative_recall[3]/float(pieces)) ** 0.5

            # final_train_results[x] = float(final_train_results[x])/pieces
            # final_nb_results[x] = float(final_nb_results[x])/pieces
            # final_senti_results[x] = float(final_senti_results[x])/pieces
            # print(final_train_results[x])
            # print(final_nb_results[x])
            # print(final_senti_results[x])
            # print()

        results = open('results.txt', 'a')

        results.write('Strength = ' + str(value) + '\n\n')

        # results.write('Training Data' + '\n')
        # results.write('Accuracy:           ' + str(final_train_results[0]) + '\n')
        # results.write('Positive Precision: ' + str(final_train_results[1]) + '\n')
        # results.write('Negative Precision: ' + str(final_train_results[2]) + '\n')
        # results.write('Positive Recall     ' + str(final_train_results[3]) + '\n')
        # results.write('Negative Recall     ' + str(final_train_results[4]) + '\n')
        # results.write('\n')

        results.write('Naive Bayes' + '\n')
        results.write('Accuracy:           ' + str(accuracy[0]) + '    ' + str(accuracy[1]) + '\n')
        results.write('Positive Precision: ' + str(positive_precision[0]) + '    ' + str(positive_precision[1]) + '\n')
        results.write('Negative Precision: ' + str(negative_precision[0]) + '    ' + str(negative_precision[1]) + '\n')
        results.write('Positive Recall     ' + str(positive_recall[0]) + '    ' + str(positive_recall[1]) + '\n')
        results.write('Negative Recall     ' + str(negative_recall[0]) + '    ' + str(negative_recall[1]) + '\n')
        results.write('\n')

        results.write('SentiWordNet' + '\n')
        results.write('Accuracy:           ' + str(accuracy[2]) + '    ' + str(accuracy[3]) + '\n')
        results.write('Positive Precision: ' + str(positive_precision[2]) + '    ' + str(positive_precision[3]) + '\n')
        results.write('Negative Precision: ' + str(negative_precision[2]) + '    ' + str(negative_precision[3]) + '\n')
        results.write('Positive Recall:     ' + str(positive_recall[2]) + '    ' + str(positive_recall[3]) + '\n')
        results.write('Negative Recall:     ' + str(negative_recall[2]) + '    ' + str(negative_recall[3]) + '\n')
        results.write('\n\n')

        results.close()
Example #20
json_paths = []
if arguments.config:
    with open(arguments.config, 'r', encoding='utf-8') as f:
        line = f.readline()
        while line:
            json_paths.append(line.replace('\n', ''))
            line = f.readline()
if arguments.json_paths:
    for x in arguments.json_paths:
        json_paths.append(x)
print(json_paths)
p = arguments.print
generate_wordcloud = arguments.word_cloud
if generate_wordcloud:
    from stop_words import safe_get_stop_words
    stop_words = safe_get_stop_words('de')
    for word in custom_stop_words:
        stop_words.append(word)
    print('Stop words:')
    print(stop_words)
    if arguments.image:
        import numpy as np
        from PIL import Image
        wordcloud_mask = np.array(Image.open(arguments.image))
        use_mask = True
    else:
        wordcloud_mask = None
        use_mask = False

wordcloud_users = arguments.word_cloud_users
if arguments.starting_time is not None:
Example #21
    def _compute_tfidf_matrix(self, base_dir=None):
        LOGGER.info('Computing TF-IDF matrix (base_dir={})'.format(base_dir))
        tfidf_data = {}

        # Compute tf-idf distribution for each language
        for lang in LANGUAGES:
            lang_data = {}

            if not base_dir:
                resources_dir = 'data/wikipages/text/authors/{}'.format(lang)
                text_authors_dir_lang = pkg_resources.resource_filename(
                    'citation_extractor',
                    resources_dir
                )
                text_authors_files = pkg_resources.resource_listdir(
                    'citation_extractor',
                    resources_dir
                )
            else:
                text_authors_dir_lang = os.path.join(base_dir, lang)
                text_authors_files = os.listdir(text_authors_dir_lang)

            LOGGER.info('Computing TF-IDF matrix: using %i documents for \
                        language %s' % (len(text_authors_files), lang))

            texts = []
            urn_to_index = {}
            index = 0
            for file in text_authors_files:
                if not file.endswith('.txt'):
                    continue

                urn = file.replace('.txt', '')
                filepath = os.path.join(text_authors_dir_lang, file)
                with open(filepath) as txt_file:
                    text = txt_file.read()
                texts.append(text)
                urn_to_index[urn] = index
                index += 1

            # Dictionary mapping a URN to an index (row)
            lang_data['urn_to_index'] = urn_to_index

            tfidf_vectorizer = TfidfVectorizer(
                input='content',
                strip_accents='unicode',
                analyzer='word',
                stop_words=safe_get_stop_words(lang)
            )

            # Language-specific vectorizer
            lang_data['vectorizer'] = tfidf_vectorizer

            # Tf-idf matrix computed with the specific vectorizer
            tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
            lang_data['matrix'] = tfidf_matrix

            tfidf_data[lang] = lang_data
            LOGGER.info('Done computing TF-IDF matrix.')

        return tfidf_data
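A minimal standalone sketch of the vectorizer configuration used above: scikit-learn's TfidfVectorizer fed a per-language stop word list from the stop_words package (scikit-learn may warn that a custom stop word list is inconsistent with its tokenizer; that warning is expected here).

from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import safe_get_stop_words

texts = ["Homer wrote the Iliad", "the Iliad and the Odyssey are ancient epics"]
vectorizer = TfidfVectorizer(input='content', strip_accents='unicode',
                             analyzer='word', stop_words=safe_get_stop_words('en'))
matrix = vectorizer.fit_transform(texts)
print(matrix.shape)  # (2, number of distinct non-stop-word terms)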
Example #22
    def get_results(self, segment=None):
        key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (self.poll.org.pk, self.poll.pk, self.pk)
        if segment:
            substituted_segment = self.poll.org.substitute_segment(segment)
            key += ":" + slugify(unicode(json.dumps(substituted_segment)))

        cached_value = cache.get(key, None)
        if cached_value:
            return cached_value["results"]

        org = self.poll.org
        open_ended = self.is_open_ended()
        responded = self.get_responded()
        polled = self.get_polled()

        results = []

        if open_ended and not segment:
            cursor = connection.cursor()

            custom_sql = """
                      SELECT w.label, count(*) AS count FROM (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label FROM polls_pollresult WHERE polls_pollresult.org_id = %d AND polls_pollresult.flow = '%s' AND polls_pollresult.ruleset = '%s') w group by w.label order by count desc;
                      """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid)

            cursor.execute(custom_sql)
            from ureport.utils import get_dict_from_cursor
            unclean_categories = get_dict_from_cursor(cursor)
            categories = []

            ureport_languages = getattr(settings, 'LANGUAGES', [('en', 'English')])

            org_languages = [lang[1].lower() for lang in ureport_languages if lang[0] == org.language]

            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            categories = []

            for category in unclean_categories:
                if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                    categories.append(dict(label=category['label'], count=int(category['count'])))

            # sort by count, then alphabetically
            categories = sorted(categories, key=lambda c: (-c['count'], c['label']))
            results.append(dict(open_ended=open_ended, set=responded, unset=polled-responded, categories=categories))

        else:
            categories_label = self.response_categories.filter(is_active=True).values_list('category', flat=True)
            question_results = self.get_question_results()

            if segment:

                location_part = segment.get('location').lower()

                if location_part not in ['state', 'district']:
                    return None

                location_boundaries = org.get_segment_org_boundaries(segment)

                for boundary in location_boundaries:
                    categories = []
                    osm_id = boundary.get('osm_id').upper()
                    set_count = 0
                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, location_part, osm_id)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), location_part, osm_id)
                        category_count = question_results.get(category_count_key, 0)
                        set_count += category_count
                        categories.append(dict(count=category_count, label=categorie_label))

                    if open_ended:
                        # For home page best and worst location responses
                        from ureport.contacts.models import Contact
                        if segment.get('location') == 'District':
                            boundary_contacts_count = Contact.objects.filter(org=org, district=osm_id).count()
                        else:
                            boundary_contacts_count = Contact.objects.filter(org=org, state=osm_id).count()
                        unset_count = boundary_contacts_count - set_count

                    results.append(dict(open_ended=open_ended, set=set_count, unset=unset_count,
                                        boundary=osm_id, label=boundary.get('name'),
                                        categories=categories))

            else:
                categories = []
                for categorie_label in categories_label:
                    category_count_key = "ruleset:%s:category:%s" % (self.ruleset_uuid, categorie_label.lower())
                    if categorie_label.lower() != 'other':
                        category_count = question_results.get(category_count_key, 0)
                        categories.append(dict(count=category_count, label=categorie_label))

                results.append(dict(open_ended=open_ended, set=responded, unset=polled-responded, categories=categories))

        cache.set(key, {"results": results}, PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT)

        return results
Example #23
 def test_safe_get_stop_words(self):
     self.assertRaises(StopWordError, get_stop_words, 'huttese')
     self.assertEqual(safe_get_stop_words('huttese'), [])
Example #24
 def __init__(self):
     logging.basicConfig(level=logging.INFO)
     self.logger = logging.getLogger(__name__)
     self.stop_words = safe_get_stop_words('german')
     print(self.stop_words)
Example #25
    def get_value_summary(cls, ruleset=None, contact_field=None, filters=None, segment=None):
        """
        Returns the results for the passed in ruleset or contact field given the passed in filters and segments.

        Filters are expected in the following formats:
            { field: rulesetId, categories: ["Red", "Blue", "Yellow"] }

        Segments are expected in these formats instead:
            { ruleset: 1515, categories: ["Red", "Blue"] }  // segmenting by another field, for those categories
            { groups: 124,151,151 }                         // segment by each each group in the passed in ids
            { location: "State", parent: null }             // segment for each admin boundary within the parent
            { contact_field: "Country", values: ["US", "EN", "RW"] } // segment by a contact field for these values
        """
        from temba.contacts.models import ContactGroup, ContactField
        from temba.flows.models import TrueTest, RuleSet

        start = time.time()
        results = []

        if (not ruleset and not contact_field) or (ruleset and contact_field):
            raise ValueError("Must specify either a RuleSet or Contact field.")

        org = ruleset.flow.org if ruleset else contact_field.org

        open_ended = ruleset and ruleset.ruleset_type == RuleSet.TYPE_WAIT_MESSAGE and len(ruleset.get_rules()) == 1

        # default our filters to an empty list if None are passed in
        if filters is None:
            filters = []

        # build the kwargs for our subcall
        kwargs = dict(ruleset=ruleset, contact_field=contact_field, filters=filters)

        # this is our list of dependencies, that is things that will blow away our results
        dependencies = set()
        fingerprint_dict = dict(filters=filters, segment=segment)
        if ruleset:
            fingerprint_dict['ruleset'] = ruleset.id
            dependencies.add(RULESET_KEY % ruleset.id)
        if contact_field:
            fingerprint_dict['contact_field'] = contact_field.id
            dependencies.add(CONTACT_KEY % contact_field.id)

        for contact_filter in filters:
            if 'ruleset' in contact_filter:
                dependencies.add(RULESET_KEY % contact_filter['ruleset'])
            if 'groups' in contact_filter:
                for group_id in contact_filter['groups']:
                    dependencies.add(GROUP_KEY % group_id)
            if 'location' in contact_filter:
                field = ContactField.get_by_label(org, contact_filter['location'])
                dependencies.add(CONTACT_KEY % field.id)

        if segment:
            if 'ruleset' in segment:
                dependencies.add(RULESET_KEY % segment['ruleset'])
            if 'groups' in segment:
                for group_id in segment['groups']:
                    dependencies.add(GROUP_KEY % group_id)
            if 'location' in segment:
                field = ContactField.get_by_label(org, segment['location'])
                dependencies.add(CONTACT_KEY % field.id)

        # our final redis key will contain each dependency as well as a HASH representing the fingerprint of the
        # kwargs passed to this method, generate that hash
        fingerprint = hash(dict_to_json(fingerprint_dict))

        # generate our key
        key = VALUE_SUMMARY_CACHE_KEY + ":" + str(org.id) + ":".join(sorted(list(dependencies))) + ":" + str(fingerprint)

        # does our value exist?
        r = get_redis_connection()
        cached = r.get(key)

        if cached is not None:
            try:
                return json_to_dict(cached)
            except Exception:
                # failed decoding, oh well, go calculate it instead
                pass

        if segment:
            # segmenting a result is the same as calculating the result with the addition of each
            # category as a filter so we expand upon the passed in filters to do this
            if 'ruleset' in segment and 'categories' in segment:
                for category in segment['categories']:
                    category_filter = list(filters)
                    category_filter.append(dict(ruleset=segment['ruleset'], categories=[category]))

                    # calculate our results for this segment
                    kwargs['filters'] = category_filter
                    (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(dict(label=category, open_ended=open_ended, set=set_count, unset=unset_count, categories=categories))

            # segmenting by groups instead, same principle but we add group filters
            elif 'groups' in segment:
                for group_id in segment['groups']:
                    # load our group
                    group = ContactGroup.user_groups.get(org=org, pk=group_id)

                    category_filter = list(filters)
                    category_filter.append(dict(groups=[group_id]))

                    # calculate our results for this segment
                    kwargs['filters'] = category_filter
                    (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(dict(label=group.name, open_ended=open_ended, set=set_count, unset_count=unset_count, categories=categories))

            # segmenting by a contact field, only for passed in categories
            elif 'contact_field' in segment and 'values' in segment:
                # look up the contact field
                field = ContactField.get_by_label(org, segment['contact_field'])

                for value in segment['values']:
                    value_filter = list(filters)
                    value_filter.append(dict(contact_field=field.pk, values=[value]))

                    # calculate our results for this segment
                    kwargs['filters'] = value_filter
                    (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(dict(label=value, open_ended=open_ended, set=set_count, unset=unset_count, categories=categories))

            # segmenting by a location field
            elif 'location' in segment:
                # look up the contact field
                field = ContactField.get_by_label(org, segment['location'])

                # make sure they are segmenting on a location type that makes sense
                if field.value_type not in [Value.TYPE_STATE, Value.TYPE_DISTRICT, Value.TYPE_WARD]:
                    raise ValueError(_("Cannot segment on location for field that is not a State or District type"))

                # make sure our org has a country for location based responses
                if not org.country:
                    raise ValueError(_("Cannot segment by location until country has been selected for organization"))

                # the boundaries we will segment by
                parent = org.country

                # figure out our parent
                parent_osm_id = segment.get('parent', None)
                if parent_osm_id:
                    parent = AdminBoundary.objects.get(osm_id=parent_osm_id)

                # get all the boundaries we are segmenting on
                boundaries = list(AdminBoundary.objects.filter(parent=parent).order_by('name'))

                # if the field is a district field, they need to specify the parent state
                if not parent_osm_id and field.value_type == Value.TYPE_DISTRICT:
                    raise ValueError(_("You must specify a parent state to segment results by district"))

                if not parent_osm_id and field.value_type == Value.TYPE_WARD:
                    raise ValueError(_("You must specify a parent state to segment results by ward"))

                # if this is a district, we can speed things up by only including those districts in our parent, build
                # the filter for that
                if parent and field.value_type in [Value.TYPE_DISTRICT, Value.TYPE_WARD]:
                    location_filters = [filters, dict(location=field.pk, boundary=[b.osm_id for b in boundaries])]
                else:
                    location_filters = filters

                # get all the contacts segment by location first
                (location_set_contacts, location_unset_contacts, location_results) = \
                    cls.get_filtered_value_summary(contact_field=field, filters=location_filters, return_contacts=True)

                # now get the contacts for our primary query
                kwargs['return_contacts'] = True
                kwargs['filter_contacts'] = location_set_contacts
                (primary_set_contacts, primary_unset_contacts, primary_results) = cls.get_filtered_value_summary(**kwargs)

                # build a map of osm_id to location_result
                osm_results = {lr['label']: lr for lr in location_results}
                empty_result = dict(contacts=list())

                for boundary in boundaries:
                    location_result = osm_results.get(boundary.osm_id, empty_result)

                    # clone our primary results
                    segmented_results = dict(label=boundary.name,
                                             boundary=boundary.osm_id,
                                             open_ended=open_ended)

                    location_categories = list()
                    location_contacts = set(location_result['contacts'])

                    for category in primary_results:
                        category_contacts = set(category['contacts'])

                        intersection = location_contacts & category_contacts
                        location_categories.append(dict(label=category['label'], count=len(intersection)))

                    segmented_results['set'] = len(location_contacts & primary_set_contacts)
                    segmented_results['unset'] = len(location_contacts & primary_unset_contacts)
                    segmented_results['categories'] = location_categories
                    results.append(segmented_results)

                results = sorted(results, key=lambda r: r['label'])

        else:
            (set_count, unset_count, categories) = cls.get_filtered_value_summary(**kwargs)

            # Check that we have an OPEN ENDED ruleset
            if ruleset and len(ruleset.get_rules()) == 1 and isinstance(ruleset.get_rules()[0].test, TrueTest):
                cursor = connection.cursor()

                custom_sql = """SELECT w.label, count(*) AS count FROM (
                    SELECT
                      regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
                    FROM msgs_msg INNER JOIN contacts_contact ON ( msgs_msg.contact_id = contacts_contact.id )
                    WHERE msgs_msg.id IN (
                      SELECT
                        msg_id
                        FROM flows_flowstep_messages, flows_flowstep
                        WHERE flowstep_id = flows_flowstep.id AND
                        flows_flowstep.step_uuid = '%s'
                      ) AND contacts_contact.is_test = False
                  ) w group by w.label order by count desc;""" % ruleset.uuid

                cursor.execute(custom_sql)
                unclean_categories = get_dict_from_cursor(cursor)
                categories = []

                org_languages = [lang.name.lower() for lang in org.languages.filter(orgs=None).distinct()]

                if 'english' not in org_languages:
                    org_languages.append('english')

                ignore_words = []
                for lang in org_languages:
                    ignore_words += safe_get_stop_words(lang)

                for category in unclean_categories:
                    if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                        categories.append(dict(label=category['label'], count=int(category['count'])))

                # sort by count, then alphabetically
                categories = sorted(categories, key=lambda c: (-c['count'], c['label']))

            results.append(dict(label=unicode(_("All")), open_ended=open_ended, set=set_count, unset=unset_count, categories=categories))

        # for each of our dependencies, add our key as something that depends on it
        pipe = r.pipeline()
        for dependency in dependencies:
            pipe.sadd(dependency, key)
            pipe.expire(dependency, VALUE_SUMMARY_CACHE_TIME)

        # and finally set our result
        pipe.set(key, dict_to_json(results), VALUE_SUMMARY_CACHE_TIME)
        pipe.execute()

        # leave me: nice for profiling..
        #from django.db import connection as db_connection, reset_queries
        #print "=" * 80
        #for query in db_connection.queries:
        #    print "%s - %s" % (query['time'], query['sql'][:1000])
        #print "-" * 80
        #print "took: %f" % (time.time() - start)
        #print "=" * 80
        #reset_queries()

        return results
Example #26
    def get_results(self, segment=None):
        key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (
            self.poll.org.pk, self.poll.pk, self.pk)
        if segment:
            substituted_segment = self.poll.org.substitute_segment(segment)
            key += ":" + slugify(unicode(json.dumps(substituted_segment)))

        cached_value = cache.get(key, None)
        if cached_value:
            return cached_value["results"]

        org = self.poll.org
        open_ended = self.is_open_ended()
        responded = self.get_responded()
        polled = self.get_polled()

        results = []

        if open_ended and not segment:
            cursor = connection.cursor()

            custom_sql = """
                      SELECT w.label, count(*) AS count FROM (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label FROM polls_pollresult WHERE polls_pollresult.org_id = %d AND polls_pollresult.flow = '%s' AND polls_pollresult.ruleset = '%s') w group by w.label order by count desc;
                      """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid)

            cursor.execute(custom_sql)
            from ureport.utils import get_dict_from_cursor
            unclean_categories = get_dict_from_cursor(cursor)
            categories = []

            ureport_languages = getattr(settings, 'LANGUAGES',
                                        [('en', 'English')])

            org_languages = [
                lang[1].lower() for lang in ureport_languages
                if lang[0] == org.language
            ]

            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            categories = []

            for category in unclean_categories:
                if len(category['label']) > 1 and category[
                        'label'] not in ignore_words and len(categories) < 100:
                    categories.append(
                        dict(label=category['label'],
                             count=int(category['count'])))

            # sort by count, then alphabetically
            categories = sorted(categories,
                                key=lambda c: (-c['count'], c['label']))
            results.append(
                dict(open_ended=open_ended,
                     set=responded,
                     unset=polled - responded,
                     categories=categories))

        else:
            categories_label = self.response_categories.filter(
                is_active=True).values_list('category', flat=True)
            question_results = self.get_question_results()

            if segment:

                location_part = segment.get('location').lower()

                if location_part not in ['state', 'district']:
                    return None

                location_boundaries = org.get_segment_org_boundaries(segment)

                for boundary in location_boundaries:
                    categories = []
                    osm_id = boundary.get('osm_id').upper()
                    set_count = 0
                    unset_count_key = "ruleset:%s:nocategory:%s:%s" % (
                        self.ruleset_uuid, location_part, osm_id)
                    unset_count = question_results.get(unset_count_key, 0)

                    for categorie_label in categories_label:
                        category_count_key = "ruleset:%s:category:%s:%s:%s" % (
                            self.ruleset_uuid, categorie_label.lower(),
                            location_part, osm_id)
                        category_count = question_results.get(
                            category_count_key, 0)
                        set_count += category_count
                        categories.append(
                            dict(count=category_count, label=categorie_label))

                    if open_ended:
                        # For home page best and worst location responses
                        from ureport.contacts.models import Contact
                        if segment.get('location') == 'District':
                            boundary_contacts_count = Contact.objects.filter(
                                org=org, district=osm_id).count()
                        else:
                            boundary_contacts_count = Contact.objects.filter(
                                org=org, state=osm_id).count()
                        unset_count = boundary_contacts_count - set_count

                    results.append(
                        dict(open_ended=open_ended,
                             set=set_count,
                             unset=unset_count,
                             boundary=osm_id,
                             label=boundary.get('name'),
                             categories=categories))

            else:
                categories = []
                for categorie_label in categories_label:
                    category_count_key = "ruleset:%s:category:%s" % (
                        self.ruleset_uuid, categorie_label.lower())
                    if categorie_label.lower() != 'other':
                        category_count = question_results.get(
                            category_count_key, 0)
                        categories.append(
                            dict(count=category_count, label=categorie_label))

                results.append(
                    dict(open_ended=open_ended,
                         set=responded,
                         unset=polled - responded,
                         categories=categories))

        cache.set(key, {"results": results},
                  PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT)

        return results
Example #27
documents = []
for i in range(len(text_files)):
#	store = open("store.html", 'w')
	soup = BeautifulSoup(open(text_files[i], encoding="utf8"), "lxml").get_text()
#	store.write(soup)
	new = open(text_files[i], 'w', encoding="utf8")
	new.write(soup)
	new.close()
	
documents = [open(f, encoding="utf8").read() for f in text_files]
print (len(text_files))


#set stop_words
stop_words_1 = safe_get_stop_words('unsupported language') + get_stop_words('en') + get_stop_words('english') + ['DOCTYPE', 'html', 'PUBLIC', 'head', 'meta', 'http', 'content', 'link', 'rel', 'href', 'title', 'style', 'type', 'import', 'media', 'script', 'javascript', 'src', 'body', 'div', 'class', 'id', 'name', 'a', 'h3', 'h1', 'h2', 'table', 'tr', 'td', 'p', 'small', 'span', 'b', 'font', 'li', 'articles', 'wikipedia', 'text', 'css', 'org', 'th', 'skins', 'width', 'en', 'wiki', r'\d+\w+']
vectorizer = CountVectorizer()
vectorizer = CountVectorizer(stop_words=stop_words_1)


#get tf_idf matrix and transform to 2D list
tfidf_matrix = vectorizer.fit_transform(documents)
#print (tfidf_vectorizer.vocabulary_)
b = tfidf_matrix.todense().tolist()
a = preprocessing.normalize(b, norm='l2')
print ((a[0]))
for i in range(len(a)):
	print (max((a[i])))

#calculate one_norm value and sort documents
one_norm = []
# nltk.download('wordnet')
# nltk.download('stopwords')

warnings.filterwarnings("ignore")

sys.stdout = open("./output/disaster_output.txt", "w")

plt.style.use('ggplot')

nlp = spacy.load('en_core_web_sm')
deselect_stop_words = ['no', 'not']  # do not treat 'no' and 'not' as stop words
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False

lemmatizer = WordNetLemmatizer()
stop_words = safe_get_stop_words('en')
hashtag_regex = re.compile(r"\#\b[\w\-\_]+\b")
twitter_segmenter = Segmenter(corpus="twitter_2018")
camelcase_regex = re.compile(
    r'((?<=[a-z])[A-Z]|(?<!^)[A-Z](?=[a-z])|[0-9]+|(?<=[0-9\-\_])[A-Za-z]|[\-\_])'
)


# DATA PRE-PROCESSING FUNCTIONS
def unescape_tweet(tweet):
    """Unescaping various chars found in text """
    return html.unescape(tweet)


def strip_html_tags(text):
    """remove html tags from text"""
Example #29
    def calculate_results(self, segment=None):

        org = self.poll.org
        open_ended = self.is_open_ended()
        responded = self.get_responded()
        polled = self.get_polled()

        results = []

        if open_ended and not segment:
            custom_sql = """
                      SELECT w.label, count(*) AS count FROM (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label FROM polls_pollresult WHERE polls_pollresult.org_id = %d AND polls_pollresult.flow = '%s' AND polls_pollresult.ruleset = '%s' AND polls_pollresult.text IS NOT NULL AND polls_pollresult.text NOT ILIKE '%s') w group by w.label;
                      """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid, "http%")
            with connection.cursor() as cursor:
                cursor.execute(custom_sql)
                from ureport.utils import get_dict_from_cursor
                unclean_categories = get_dict_from_cursor(cursor)

            ureport_languages = getattr(settings, 'LANGUAGES', [('en', 'English')])

            org_languages = [lang[1].lower() for lang in ureport_languages if lang[0] == org.language]

            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            categories = []

            for category in unclean_categories:
                if len(category['label']) > 1 and category['label'] not in ignore_words and len(categories) < 100:
                    categories.append(dict(label=category['label'], count=int(category['count'])))

            # sort by count, then alphabetically
            categories = sorted(categories, key=lambda c: (-c['count'], c['label']))
            results.append(dict(open_ended=open_ended, set=responded, unset=polled-responded, categories=categories))

        else:
            categories_label = self.response_categories.filter(is_active=True).values_list('category', flat=True)
            question_results = self.get_question_results()

            if segment:

                location_part = segment.get('location', '').lower()
                age_part = segment.get('age', '').lower()
                gender_part = segment.get('gender', '').lower()

                if location_part in ['state', 'district', 'ward']:

                    location_boundaries = org.get_segment_org_boundaries(segment)

                    for boundary in location_boundaries:
                        categories = []
                        osm_id = boundary.get('osm_id').upper()
                        set_count = 0
                        unset_count_key = "ruleset:%s:nocategory:%s:%s" % (self.ruleset_uuid, location_part, osm_id)
                        unset_count = question_results.get(unset_count_key, 0)

                        for categorie_label in categories_label:
                            category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), location_part, osm_id)
                            category_count = question_results.get(category_count_key, 0)
                            set_count += category_count
                            categories.append(dict(count=category_count, label=categorie_label))

                        results.append(dict(open_ended=open_ended, set=set_count, unset=unset_count,
                                            boundary=osm_id, label=boundary.get('name'),
                                            categories=categories))
                elif age_part:
                    poll_year = self.poll.poll_date.year

                    born_results = {k: v for k, v in question_results.iteritems() if k[-9:-5] == 'born'}

                    age_intervals = dict()
                    age_intervals['35+'] = (35, 2000)
                    age_intervals['31-34'] = (31, 34)
                    age_intervals['25-30'] = (25, 30)
                    age_intervals['20-24'] = (20, 24)
                    age_intervals['15-19'] = (15, 19)
                    age_intervals['0-14'] = (0, 14)

                    for age_group in age_intervals.keys():
                        lower_bound, upper_bound = age_intervals[age_group]
                        unset_count = 0

                        categories_count = dict()
                        for categorie_label in categories_label:
                            if categorie_label.lower() != 'other':
                                categories_count[categorie_label.lower()] = 0

                        for result_key, result_count in born_results.iteritems():
                            age = poll_year - int(result_key[-4:])

                            if lower_bound <= age < upper_bound:
                                if 'nocategory' in result_key:
                                    unset_count += result_count

                                for categorie_label in categories_label:
                                    if categorie_label.lower() != 'other':
                                        if result_key.startswith('ruleset:%s:category:%s:' % (self.ruleset_uuid, categorie_label.lower())):
                                            categories_count[categorie_label.lower()] += result_count

                        categories = [dict(count=v, label=k) for k, v in categories_count.iteritems()]

                        set_count = sum([elt['count'] for elt in categories])

                        results.append(dict(set=set_count, unset=unset_count, label=age_group,
                                            categories=categories))

                    results = sorted(results, key=lambda i:i['label'])

                elif gender_part:

                    genders = ['f', 'm']
                    gender_labels = dict(f=_('Female'), m=_('Male'))

                    for gender in genders:
                        categories = []
                        set_count = 0
                        unset_count_key = "ruleset:%s:nocategory:%s:%s"% (self.ruleset_uuid, 'gender', gender)
                        unset_count = question_results.get(unset_count_key, 0)

                        for categorie_label in categories_label:
                            category_count_key = "ruleset:%s:category:%s:%s:%s" % (self.ruleset_uuid, categorie_label.lower(), 'gender', gender)
                            if categorie_label.lower() != 'other':
                                category_count = question_results.get(category_count_key, 0)
                                set_count += category_count
                                categories.append(dict(count=category_count, label=categorie_label))

                        results.append(dict(set=set_count, unset=unset_count, label=gender_labels.get(gender),
                                            categories=categories))

            else:
                categories = []
                for categorie_label in categories_label:
                    category_count_key = "ruleset:%s:category:%s" % (self.ruleset_uuid, categorie_label.lower())
                    if categorie_label.lower() != 'other':
                        category_count = question_results.get(category_count_key, 0)
                        categories.append(dict(count=category_count, label=categorie_label))

                results.append(dict(open_ended=open_ended, set=responded, unset=polled-responded, categories=categories))

        cache_time = PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT
        if not segment:
            cache_time = None

        if segment and segment.get('location', '').lower() == 'state':
            cache_time = None

        if segment and segment.get('age', '').lower() == 'age':
            cache_time = None

        if segment and segment.get('gender', '').lower() == 'gender':
            cache_time = None

        key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (self.poll.org.pk, self.poll.pk, self.pk)
        if segment:
            substituted_segment = self.poll.org.substitute_segment(segment)
            key += ":" + slugify(unicode(json.dumps(substituted_segment)))

        cache.set(key, {"results": results}, cache_time)

        return results
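
The age segmentation in calculate_results above derives an age from the birth year embedded at the end of each result key and buckets it into fixed intervals. A standalone sketch of just that bucketing, with invented keys and counts (not part of the original example):

poll_year = 2024                                          # assumed poll year
age_intervals = {'0-14': (0, 14), '15-19': (15, 19), '20-24': (20, 24),
                 '25-30': (25, 30), '31-34': (31, 34), '35+': (35, 2000)}

born_results = {'ruleset:abc:category:yes:born:1992': 7,  # invented keys and counts
                'ruleset:abc:nocategory:born:2008': 3}

buckets = {group: 0 for group in age_intervals}
for key, count in born_results.items():
    age = poll_year - int(key[-4:])                       # birth year is the last 4 characters
    for group, (lower, upper) in age_intervals.items():
        if lower <= age < upper:
            buckets[group] += count

print(buckets)  # ages 32 and 16 land in '31-34' and '15-19'; note the half-open upper bound,
                # so an age exactly equal to an interval's upper value falls into no bucket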
Beispiel #30
0
tokenize.pattern = re.compile(r"\W+")  # split on runs of non-word characters


def delete_stop_words(query):
    """
    Remove stop words.
    :param query: the original list of words
    :return: the words with the stop words removed
    """
    return (token for token in query
            if token not in delete_stop_words.stop_words_set)


delete_stop_words.stop_words_set = stop_words_set = set(
    safe_get_stop_words("ru") + safe_get_stop_words("en"))


def stem(word):
    """
    Stem a word.
    :param word: the original word
    :return: the word's stem (the shorter of the English and Russian stems)
    """
    return min(stem.en.stemWord(word),
               stem.ru.stemWord(word),
               key=lambda x: len(x))


stem.ru = stemmer('russian')
stem.en = stemmer('english')
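
A minimal usage sketch showing how these helpers compose (the sample query string is invented; `tokenize` is the function this example attaches its regex to above):

tokens = [t for t in tokenize.pattern.split("Поиск the best книг по Python".lower()) if t]
stems = [stem(token) for token in delete_stop_words(tokens)]
print(stems)  # e.g. the stems of "поиск", "best", "книг", "python"; "the" and "по" are dropped as stop words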
from titlecase import titlecase
from unidecode import unidecode
from yarl import URL

env = Env(GITHUB_USERNAME=str, GITHUB_TOKEN=str, PINBOARD_TOKEN=str)

GITHUB_TOKEN = env.str("GITHUB_TOKEN")
GITHUB_USERNAME = env.str("GITHUB_USERNAME")
PINBOARD_TOKEN = env.str("PINBOARD_TOKEN")

IGNORE_WORDS = set(
    [word.lower() for word in Path("IGNORE_WORDS.txt").read_text().split()])

STOP_WORDS = set(
    [word.lower() for word in Path("STOP_WORDS.txt").read_text().split()])
STOP_WORDS.update(set(safe_get_stop_words("english")))

IGNORE_TAGS = IGNORE_WORDS | STOP_WORDS


def get_dev_to_info_for_url(url):
    try:
        req = requests.get(url, timeout=1.0)
        soup = BeautifulSoup(req.text, "html.parser")
        data = {
            "tags": [
                tag.text.lstrip("#")
                for tag in soup.find_all("a", {"class": "tag"})
            ]
        }
        return data
    except requests.RequestException:
        # NOTE: the original example's exception handling is cut off in this excerpt;
        # an empty tag list is assumed here so the function stays well-formed.
        return {"tags": []}
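
A short sketch of how the scraped tags might then be filtered against IGNORE_TAGS before use (the helper name and URL are hypothetical, not part of the original example):

def clean_dev_to_tags(url):
    # drop stop words / ignored words and de-duplicate, keeping lowercase tags
    tags = get_dev_to_info_for_url(url).get("tags", [])
    return sorted({tag.lower() for tag in tags if tag.lower() not in IGNORE_TAGS})

# clean_dev_to_tags("https://dev.to/example/some-post")  # hypothetical URL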
Beispiel #32
0
    def calculate_results(self, segment=None):

        org = self.poll.org
        open_ended = self.is_open_ended()
        responded = self.get_responded()
        polled = self.get_polled()

        results = []

        if open_ended and not segment:
            custom_sql = """
                      SELECT w.label, count(*) AS count FROM (SELECT regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label FROM polls_pollresult WHERE polls_pollresult.org_id = %d AND polls_pollresult.flow = '%s' AND polls_pollresult.ruleset = '%s' AND polls_pollresult.text IS NOT NULL AND polls_pollresult.text NOT ILIKE '%s') w group by w.label;
                      """ % (org.id, self.poll.flow_uuid, self.ruleset_uuid,
                             "http%")
            with connection.cursor() as cursor:
                cursor.execute(custom_sql)
                from ureport.utils import get_dict_from_cursor
                unclean_categories = get_dict_from_cursor(cursor)

            ureport_languages = getattr(settings, 'LANGUAGES',
                                        [('en', 'English')])

            org_languages = [
                lang[1].lower() for lang in ureport_languages
                if lang[0] == org.language
            ]

            if 'english' not in org_languages:
                org_languages.append('english')

            ignore_words = []
            for lang in org_languages:
                ignore_words += safe_get_stop_words(lang)

            categories = []

            # sort by count, then alphabetically
            unclean_categories = sorted(unclean_categories,
                                        key=lambda c:
                                        (-c['count'], c['label']))

            for category in unclean_categories:
                if len(category['label']) > 1 and category[
                        'label'] not in ignore_words and len(categories) < 100:
                    categories.append(
                        dict(label=category['label'],
                             count=int(category['count'])))

            results.append(
                dict(open_ended=open_ended,
                     set=responded,
                     unset=polled - responded,
                     categories=categories))

        else:
            categories_label = self.response_categories.filter(
                is_active=True).values_list('category', flat=True)
            question_results = self.get_question_results()

            if segment:

                location_part = segment.get('location', '').lower()
                age_part = segment.get('age', '').lower()
                gender_part = segment.get('gender', '').lower()

                if location_part in ['state', 'district', 'ward']:

                    location_boundaries = org.get_segment_org_boundaries(
                        segment)

                    for boundary in location_boundaries:
                        categories = []
                        osm_id = boundary.get('osm_id').upper()
                        set_count = 0
                        unset_count_key = "ruleset:%s:nocategory:%s:%s" % (
                            self.ruleset_uuid, location_part, osm_id)
                        unset_count = question_results.get(unset_count_key, 0)

                        for categorie_label in categories_label:
                            if categorie_label.lower(
                            ) not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                                category_count_key = "ruleset:%s:category:%s:%s:%s" % (
                                    self.ruleset_uuid, categorie_label.lower(),
                                    location_part, osm_id)
                                category_count = question_results.get(
                                    category_count_key, 0)
                                set_count += category_count
                                categories.append(
                                    dict(count=category_count,
                                         label=categorie_label))

                        results.append(
                            dict(open_ended=open_ended,
                                 set=set_count,
                                 unset=unset_count,
                                 boundary=osm_id,
                                 label=boundary.get('name'),
                                 categories=categories))
                elif age_part:
                    poll_year = self.poll.poll_date.year

                    born_results = {
                        k: v
                        for k, v in question_results.iteritems()
                        if k[-9:-5] == 'born'
                    }

                    age_intervals = dict()
                    age_intervals['35+'] = (35, 2000)
                    age_intervals['31-34'] = (31, 34)
                    age_intervals['25-30'] = (25, 30)
                    age_intervals['20-24'] = (20, 24)
                    age_intervals['15-19'] = (15, 19)
                    age_intervals['0-14'] = (0, 14)

                    for age_group in age_intervals.keys():
                        lower_bound, upper_bound = age_intervals[age_group]
                        unset_count = 0

                        categories_count = dict()
                        for categorie_label in categories_label:
                            if categorie_label.lower(
                            ) not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                                categories_count[categorie_label.lower()] = 0

                        for result_key, result_count in born_results.iteritems(
                        ):
                            age = poll_year - int(result_key[-4:])

                            if lower_bound <= age < upper_bound:
                                if 'nocategory' in result_key:
                                    unset_count += result_count

                                for categorie_label in categories_label:
                                    if categorie_label.lower(
                                    ) not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                                        if result_key.startswith(
                                                'ruleset:%s:category:%s:' %
                                            (self.ruleset_uuid,
                                             categorie_label.lower())):
                                            categories_count[
                                                categorie_label.lower(
                                                )] += result_count

                        categories = [
                            dict(count=v, label=k)
                            for k, v in categories_count.iteritems()
                        ]

                        set_count = sum([elt['count'] for elt in categories])

                        results.append(
                            dict(set=set_count,
                                 unset=unset_count,
                                 label=age_group,
                                 categories=categories))

                    results = sorted(results, key=lambda i: i['label'])

                elif gender_part:

                    genders = ['f', 'm']
                    gender_labels = dict(f=_('Female'), m=_('Male'))

                    for gender in genders:
                        categories = []
                        set_count = 0
                        unset_count_key = "ruleset:%s:nocategory:%s:%s" % (
                            self.ruleset_uuid, 'gender', gender)
                        unset_count = question_results.get(unset_count_key, 0)

                        for categorie_label in categories_label:
                            category_count_key = "ruleset:%s:category:%s:%s:%s" % (
                                self.ruleset_uuid, categorie_label.lower(),
                                'gender', gender)
                            if categorie_label.lower(
                            ) not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                                category_count = question_results.get(
                                    category_count_key, 0)
                                set_count += category_count
                                categories.append(
                                    dict(count=category_count,
                                         label=categorie_label))

                        results.append(
                            dict(set=set_count,
                                 unset=unset_count,
                                 label=gender_labels.get(gender),
                                 categories=categories))

            else:
                categories = []
                for categorie_label in categories_label:
                    category_count_key = "ruleset:%s:category:%s" % (
                        self.ruleset_uuid, categorie_label.lower())
                    if categorie_label.lower(
                    ) not in PollResponseCategory.IGNORED_CATEGORY_RULES:
                        category_count = question_results.get(
                            category_count_key, 0)
                        categories.append(
                            dict(count=category_count, label=categorie_label))

                results.append(
                    dict(open_ended=open_ended,
                         set=responded,
                         unset=polled - responded,
                         categories=categories))

        cache_time = PollQuestion.POLL_QUESTION_RESULTS_CACHE_TIMEOUT
        if not segment:
            cache_time = None

        if segment and segment.get('location', '').lower() == 'state':
            cache_time = None

        if segment and segment.get('age', '').lower() == 'age':
            cache_time = None

        if segment and segment.get('gender', '').lower() == 'gender':
            cache_time = None

        key = PollQuestion.POLL_QUESTION_RESULTS_CACHE_KEY % (
            self.poll.org.pk, self.poll.pk, self.pk)
        if segment:
            substituted_segment = self.poll.org.substitute_segment(segment)
            key += ":" + slugify(unicode(json.dumps(substituted_segment)))

        cache.set(key, {"results": results}, cache_time)

        return results
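
Both versions of calculate_results above rely on the same stop-word pattern: collect stop words for every language the org uses (always including English) and drop them from the open-ended word counts. A standalone sketch of that pattern, with invented word counts:

from stop_words import safe_get_stop_words

org_languages = ['french', 'english']                 # assumed org configuration
ignore_words = []
for lang in org_languages:
    ignore_words += safe_get_stop_words(lang)         # safe_* returns [] for unsupported languages

word_counts = [{'label': 'the', 'count': 40},         # invented counts from an open-ended question
               {'label': 'water', 'count': 9},
               {'label': 'routes', 'count': 4}]

categories = [dict(label=w['label'], count=int(w['count']))
              for w in word_counts
              if len(w['label']) > 1 and w['label'] not in ignore_words][:100]

print(categories)  # 'the' is an English stop word and is dropped; the rest are kept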
Beispiel #33
0
    def get_value_summary(cls,
                          ruleset=None,
                          contact_field=None,
                          filters=None,
                          segment=None):
        """
        Returns the results for the passed in ruleset or contact field given the passed in filters and segments.

        Filters are expected in the following formats:
            { field: rulesetId, categories: ["Red", "Blue", "Yellow"] }

        Segments are expected in these formats instead:
            { ruleset: 1515, categories: ["Red", "Blue"] }  // segmenting by another field, for those categories
            { groups: 124,151,151 }                         // segment by each group in the passed-in ids
            { location: "State", parent: null }             // segment for each admin boundary within the parent
            { contact_field: "Country", values: ["US", "EN", "RW"] } // segment by a contact field for these values
        """
        from temba.contacts.models import ContactGroup, ContactField
        from temba.flows.models import TrueTest, RuleSet

        # start = time.time()
        results = []

        if (not ruleset and not contact_field) or (
                ruleset and contact_field):  # pragma: needs cover
            raise ValueError("Must specify either a RuleSet or Contact field.")

        org = ruleset.flow.org if ruleset else contact_field.org

        open_ended = ruleset and ruleset.ruleset_type == RuleSet.TYPE_WAIT_MESSAGE and len(
            ruleset.get_rules()) == 1

        # default our filters to an empty list if None are passed in
        if filters is None:
            filters = []

        # build the kwargs for our subcall
        kwargs = dict(ruleset=ruleset,
                      contact_field=contact_field,
                      filters=filters)

        # this is our list of dependencies, that is things that will blow away our results
        dependencies = set()
        fingerprint_dict = dict(filters=filters, segment=segment)
        if ruleset:
            fingerprint_dict['ruleset'] = ruleset.id
            dependencies.add(RULESET_KEY % ruleset.id)
        if contact_field:
            fingerprint_dict['contact_field'] = contact_field.id
            dependencies.add(CONTACT_KEY % contact_field.id)

        for contact_filter in filters:
            if 'ruleset' in contact_filter:
                dependencies.add(RULESET_KEY % contact_filter['ruleset'])
            if 'groups' in contact_filter:
                for group_id in contact_filter['groups']:
                    dependencies.add(GROUP_KEY % group_id)
            if 'location' in contact_filter:  # pragma: needs cover
                field = ContactField.get_by_label(org,
                                                  contact_filter['location'])
                dependencies.add(CONTACT_KEY % field.id)

        if segment:
            if 'ruleset' in segment:
                dependencies.add(RULESET_KEY % segment['ruleset'])
            if 'groups' in segment:  # pragma: needs cover
                for group_id in segment['groups']:
                    dependencies.add(GROUP_KEY % group_id)
            if 'location' in segment:
                field = ContactField.get_by_label(org, segment['location'])
                dependencies.add(CONTACT_KEY % field.id)

        # our final redis key will contain each dependency as well as a HASH representing the fingerprint of the
        # kwargs passed to this method, generate that hash
        fingerprint = hash(dict_to_json(fingerprint_dict))

        # generate our key
        key = VALUE_SUMMARY_CACHE_KEY + ":" + str(org.id) + ":".join(
            sorted(list(dependencies))) + ":" + str(fingerprint)

        # does our value exist?
        r = get_redis_connection()
        cached = r.get(key)

        if cached is not None:
            try:
                return json_to_dict(cached)
            except Exception:  # pragma: needs cover
                # failed decoding, oh well, go calculate it instead
                pass

        if segment:
            # segmenting a result is the same as calculating the result with the addition of each
            # category as a filter so we expand upon the passed in filters to do this
            if 'ruleset' in segment and 'categories' in segment:
                for category in segment['categories']:
                    category_filter = list(filters)
                    category_filter.append(
                        dict(ruleset=segment['ruleset'],
                             categories=[category]))

                    # calculate our results for this segment
                    kwargs['filters'] = category_filter
                    (set_count, unset_count,
                     categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(
                        dict(label=category,
                             open_ended=open_ended,
                             set=set_count,
                             unset=unset_count,
                             categories=categories))

            # segmenting by groups instead, same principle but we add group filters
            elif 'groups' in segment:  # pragma: needs cover
                for group_id in segment['groups']:
                    # load our group
                    group = ContactGroup.user_groups.get(org=org, pk=group_id)

                    category_filter = list(filters)
                    category_filter.append(dict(groups=[group_id]))

                    # calculate our results for this segment
                    kwargs['filters'] = category_filter
                    (set_count, unset_count,
                     categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(
                        dict(label=group.name,
                             open_ended=open_ended,
                             set=set_count,
                             unset_count=unset_count,
                             categories=categories))

            # segmenting by a contact field, only for passed in categories
            elif 'contact_field' in segment and 'values' in segment:
                # look up the contact field
                field = ContactField.get_by_label(org,
                                                  segment['contact_field'])

                for value in segment['values']:
                    value_filter = list(filters)
                    value_filter.append(
                        dict(contact_field=field.pk, values=[value]))

                    # calculate our results for this segment
                    kwargs['filters'] = value_filter
                    (set_count, unset_count,
                     categories) = cls.get_filtered_value_summary(**kwargs)
                    results.append(
                        dict(label=value,
                             open_ended=open_ended,
                             set=set_count,
                             unset=unset_count,
                             categories=categories))

            # segmenting by a location field
            elif 'location' in segment:
                # look up the contact field
                field = ContactField.get_by_label(org, segment['location'])

                # make sure they are segmenting on a location type that makes sense
                if field.value_type not in [
                        Value.TYPE_STATE, Value.TYPE_DISTRICT, Value.TYPE_WARD
                ]:  # pragma: needs cover
                    raise ValueError(
                        _("Cannot segment on location for field that is not a State or District type"
                          ))

                # make sure our org has a country for location based responses
                if not org.country:  # pragma: needs cover
                    raise ValueError(
                        _("Cannot segment by location until country has been selected for organization"
                          ))

                # the boundaries we will segment by
                parent = org.country

                # figure out our parent
                parent_osm_id = segment.get('parent', None)
                if parent_osm_id:
                    parent = AdminBoundary.objects.get(osm_id=parent_osm_id)

                # get all the boundaries we are segmenting on
                boundaries = list(
                    AdminBoundary.objects.filter(
                        parent=parent).order_by('name'))

                # if the field is a district field, they need to specify the parent state
                if not parent_osm_id and field.value_type == Value.TYPE_DISTRICT:  # pragma: needs cover
                    raise ValueError(
                        _("You must specify a parent state to segment results by district"
                          ))

                if not parent_osm_id and field.value_type == Value.TYPE_WARD:  # pragma: needs cover
                    raise ValueError(
                        _("You must specify a parent state to segment results by ward"
                          ))

                # if this is a district, we can speed things up by only including those districts in our parent, build
                # the filter for that
                if parent and field.value_type in [
                        Value.TYPE_DISTRICT, Value.TYPE_WARD
                ]:
                    location_filters = [
                        filters,
                        dict(location=field.pk,
                             boundary=[b.osm_id for b in boundaries])
                    ]
                else:
                    location_filters = filters

                # get all the contacts segment by location first
                (location_set_contacts, location_unset_contacts, location_results) = \
                    cls.get_filtered_value_summary(contact_field=field, filters=location_filters, return_contacts=True)

                # now get the contacts for our primary query
                kwargs['return_contacts'] = True
                kwargs['filter_contacts'] = location_set_contacts
                (primary_set_contacts, primary_unset_contacts,
                 primary_results) = cls.get_filtered_value_summary(**kwargs)

                # build a map of osm_id to location_result
                osm_results = {lr['label']: lr for lr in location_results}
                empty_result = dict(contacts=list())

                for boundary in boundaries:
                    location_result = osm_results.get(boundary.osm_id,
                                                      empty_result)

                    # clone our primary results
                    segmented_results = dict(label=boundary.name,
                                             boundary=boundary.osm_id,
                                             open_ended=open_ended)

                    location_categories = list()
                    location_contacts = set(location_result['contacts'])

                    for category in primary_results:
                        category_contacts = set(category['contacts'])

                        intersection = location_contacts & category_contacts
                        location_categories.append(
                            dict(label=category['label'],
                                 count=len(intersection)))

                    segmented_results['set'] = len(location_contacts
                                                   & primary_set_contacts)
                    segmented_results['unset'] = len(location_contacts
                                                     & primary_unset_contacts)
                    segmented_results['categories'] = location_categories
                    results.append(segmented_results)

                results = sorted(results, key=lambda r: r['label'])

        else:
            (set_count, unset_count,
             categories) = cls.get_filtered_value_summary(**kwargs)

            # Check that we have an OPEN ENDED ruleset
            if ruleset and len(ruleset.get_rules()) == 1 and isinstance(
                    ruleset.get_rules()[0].test, TrueTest):
                cursor = connection.cursor()

                custom_sql = """SELECT w.label, count(*) AS count FROM (
                    SELECT
                      regexp_split_to_table(LOWER(text), E'[^[:alnum:]_]') AS label
                    FROM msgs_msg INNER JOIN contacts_contact ON ( msgs_msg.contact_id = contacts_contact.id )
                    WHERE msgs_msg.id IN (
                      SELECT
                        msg_id
                        FROM flows_flowstep_messages, flows_flowstep
                        WHERE flowstep_id = flows_flowstep.id AND
                        flows_flowstep.step_uuid = '%s'
                      ) AND contacts_contact.is_test = False
                  ) w group by w.label order by count desc;""" % ruleset.uuid

                cursor.execute(custom_sql)
                unclean_categories = get_dict_from_cursor(cursor)
                categories = []

                org_languages = [
                    lang.name.lower()
                    for lang in org.languages.filter(orgs=None).distinct()
                ]

                if 'english' not in org_languages:
                    org_languages.append('english')

                ignore_words = []
                for lang in org_languages:
                    ignore_words += safe_get_stop_words(lang)

                for category in unclean_categories:
                    if len(category['label']) > 1 and category[
                            'label'] not in ignore_words and len(
                                categories) < 100:
                        categories.append(
                            dict(label=category['label'],
                                 count=int(category['count'])))

                # sort by count, then alphabetically
                categories = sorted(categories,
                                    key=lambda c: (-c['count'], c['label']))

            results.append(
                dict(label=six.text_type(_("All")),
                     open_ended=open_ended,
                     set=set_count,
                     unset=unset_count,
                     categories=categories))

        # for each of our dependencies, add our key as something that depends on it
        pipe = r.pipeline()
        for dependency in dependencies:
            pipe.sadd(dependency, key)
            pipe.expire(dependency, VALUE_SUMMARY_CACHE_TIME)

        # and finally set our result
        pipe.set(key, dict_to_json(results), VALUE_SUMMARY_CACHE_TIME)
        pipe.execute()

        # leave me: nice for profiling..
        # from django.db import connection as db_connection, reset_queries
        # print "=" * 80
        # for query in db_connection.queries:
        #    print "%s - %s" % (query['time'], query['sql'][:1000])
        # print "-" * 80
        # print "took: %f" % (time.time() - start)
        # print "=" * 80
        # reset_queries()

        return results
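
The caching scheme in get_value_summary is worth isolating: the result key embeds every dependency plus a fingerprint of the query, and each dependency key keeps the set of result keys that must be invalidated when it changes. A simplified sketch of that pattern (the key template, TTL, and redis wiring are assumptions, not the project's actual constants):

import json

def summary_cache_key(org_id, dependencies, fingerprint_dict):
    # stable fingerprint of the query parameters (note: hash() is per-process in Python 3)
    fingerprint = hash(json.dumps(fingerprint_dict, sort_keys=True))
    return "value_summary:%d:%s:%s" % (org_id, ":".join(sorted(dependencies)), fingerprint)

# deps = {"ruleset_key:42", "contact_field_key:7"}                  # hypothetical dependency keys
# key = summary_cache_key(12, deps, dict(filters=[], segment=None))
# for dep in deps:                                                  # invalidation: each dependency
#     r.sadd(dep, key); r.expire(dep, 3600)                         # remembers the keys built on it
# r.set(key, json.dumps(results), ex=3600)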
Beispiel #34
0
import pandas as pd  # needed for pd.read_excel below (import not shown in this excerpt)

from sklearn.feature_extraction.text import CountVectorizer  # needed for the vectorizer below
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import *

import gensim
from collections import defaultdict
from stop_words import safe_get_stop_words

from sklearn.model_selection import train_test_split

from termcolor import colored

stop_words = safe_get_stop_words('russian')

data = pd.read_excel('reviews.xlsx')
data.columns = ['rate', 'text']

data.text = data.text.str.lower()

data.rate.replace(to_replace=[1, 2], value=-1, inplace=True)
data.rate.replace(to_replace=[3], value=0, inplace=True)
data.rate.replace(to_replace=[4, 5], value=1, inplace=True)

y = data.rate
x = data.text.values

cvec = CountVectorizer(stop_words=stop_words)
x_cvec = cvec.fit_transform(x)
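
A plausible continuation of this example (not shown in the excerpt) is to split the vectorized reviews and fit one of the imported classifiers; a minimal sketch using MultinomialNB:

x_train, x_test, y_train, y_test = train_test_split(
    x_cvec, y, test_size=0.2, random_state=42, stratify=y)

clf = MultinomialNB()
clf.fit(x_train, y_train)
accuracy = accuracy_score(y_test, clf.predict(x_test))  # accuracy_score comes from sklearn.metrics above
print(colored("accuracy: %.3f" % accuracy, "green"))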
Beispiel #35
0
if args.w is None:
    args.w = "english"

if args.min is None:
    args.min = 0

if args.boost is None:
    args.boost = 1

if args.blow is None:
    args.blow = 1

stopwords = set(STOPWORDS)
if args.l is not None:
    for language in args.l:
        stopwords.update(safe_get_stop_words(language.lower()))

mask = None
colors = None
if args.m is not None:
    print("Creating mask...", end=" ", flush=True)
    mask = np.array(Image.open(args.m).convert("RGB"))
    colors = ImageColorGenerator(mask)
    print("Done!")

cw = 0
cc = None
if args.c is not None:
    cw = int(args.c[0])
    cc = args.c[1]
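
The excerpt stops before the cloud itself is built; a hedged sketch of how the prepared stopwords, mask, and colors would typically be passed to wordcloud (the source text variable and output filename are assumptions):

wc = WordCloud(stopwords=stopwords, mask=mask, background_color="white")
wc.generate(text)                  # `text` assumed to be loaded elsewhere from the CLI args
if colors is not None:
    wc.recolor(color_func=colors)  # reuse the colours of the mask image
wc.to_file("cloud.png")            # hypothetical output path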