Code example #1
File: features.py  Project: iamsimha/hdepot-relevancy
def basic_features():

    # Spelling corrector
    df_all['search_term'] = df_all['search_term'].apply(lambda x : spellingCorrector.correctSpelling(x))
    # Remove stop words
    df_all['product_description'] = df_all['product_description'].apply(lambda x: utils.remove_stop_words(x))
    df_all['product_title'] = df_all['product_title'].apply(lambda x: utils.remove_stop_words(x))
    df_all['search_term'] = df_all['search_term'].apply(lambda x: utils.remove_stop_words(x))

    # Stem words
    df_all['product_description'] = df_all['product_description'].apply(lambda x: utils.str_stem(x))
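
The `utils` module these examples call is not shown. As a rough, hypothetical sketch of what `remove_stop_words` and `str_stem` could look like (an assumption based on how they are called, not the project's actual code):

# Hypothetical stand-ins for the unshown utils helpers (requires the NLTK stopwords corpus).
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

_STOP = set(stopwords.words('english'))
_STEMMER = SnowballStemmer('english')

def remove_stop_words(text):
    # Drop common English stop words, keeping the remaining token order.
    return ' '.join(w for w in text.split() if w.lower() not in _STOP)

def str_stem(text):
    # Reduce each token to its stem, e.g. "running dogs" -> "run dog".
    return ' '.join(_STEMMER.stem(w) for w in text.split())
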
Code example #2
def similarity_score(t, s):
    """Returns a similarity score for a given sentence.

    similarity score = (number of tokens in the sentence that also exist
                        in the title / number of tokens in the title) * 0.1

    """
    t = utils.remove_stop_words(t.lower())
    s = utils.remove_stop_words(s.lower())
    t_tokens, s_tokens = t.split(), s.split()
    similar = [w for w in s_tokens if w in t_tokens]
    if len(t_tokens) == 0:
        return 0
    score = (len(similar) * 0.1) / float(len(t_tokens))
    return score
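
A quick usage sketch (assuming a whitespace-based remove_stop_words such as the stand-in above; the exact numbers depend on the stop-word list in use):

title = "stainless steel kitchen sink"
sentence = "deep stainless sink with drain"
print(similarity_score(title, sentence))
# 2 shared tokens ("stainless", "sink") out of 4 title tokens, scaled by 0.1 -> 0.05
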
Code example #3
    def compute_word_to_weight_helper(self, word_to_weight, line, section_index):
        line = re.sub('[^0-9a-zA-Z]+', ' ', line)
        line = utils.remove_stop_words(line)
        if self.section_weights[section_index] == 0:
            return
        word_weight = self.section_weights[section_index]

        tokens = nltk.word_tokenize(line)
        words = [0] * len(tokens)
        stemmer = nltk.stem.lancaster.LancasterStemmer()
        for i in range(len(tokens)):
            words[i] = stemmer.stem(tokens[i])

        # Also add 2 and 3 sized word grams into the features
        for index, word in enumerate(words):
            for new_index in xrange(index, index + self.max_word_gram_size):
                # Ensure words are not out of range
                if (len(words) <= new_index):
                    continue
                word_gram = words[index : new_index + 1]
                key = ' '.join(word_gram)
                if self.sectionalize:
                    key = str(section_index) + '_' + key
                if key in word_to_weight:
                    word_to_weight[key] += word_weight
                else:
                    word_to_weight[key] = word_weight
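
The nested loops above emit every 1- to max_word_gram_size-gram starting at each token position. A standalone illustration of that windowing (a hypothetical helper, not part of the class):

def word_grams(words, max_gram_size=3):
    # Yield every contiguous 1..max_gram_size word sequence, mirroring the loops above.
    for index in range(len(words)):
        for new_index in range(index, index + max_gram_size):
            if len(words) <= new_index:
                continue
            yield ' '.join(words[index:new_index + 1])

print(list(word_grams(['red', 'oak', 'table'])))
# ['red', 'red oak', 'red oak table', 'oak', 'oak table', 'table']
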
Code example #4
def upvote():
	try:
		df = utils.read_data(FILE)
		if len(request.form["search_type"]) > 0 and isinstance(request.form["search_type"], str) == True and str(request.form["search_type"]) in ["label_choice", "name_choice", "research_choice"]:
			search_type = request.form["search_type"]
			if search_type == "label_choice":
				send_msg = request.form["wine_name"]
				answer = str(send_msg)
			elif search_type == "name_choice":
				answer = df.loc[df['name'] == str(request.form["wine_name"])]
				answer = answer[['name', 'image', 'pays','region', 'appelation', 'domaine', 'millesime', 'couleur','description']].head(10)
			elif search_type == "research_choice":
				couleur = str(request.form["couleur"])
				pays = str(request.form["pays"])
				want = str(request.form["want"])
				want = utils.remove_stop_words(want, STOP)
				want = str(utils.process_request(want))
				answer = utils.generate_research_choice(df, couleur, pays)
				if len(want) > 0:
					answer = utils.treat_input(answer,want,'clean_text')
				else:
					pass
			else:
				pass
			return utils.generate_answer(answer)

		else:
			return False
	except:
		return False
Code example #5
File: doc_collector.py  Project: acrosson/braise
def get_cleaned_docs_from_db():
    documents = session.query(Document).all()
    raw_filenames = [doc.to_dict()['raw_filename'] for doc in documents]

    cleaned_docs = []
    for raw_filename in raw_filenames:
        k = Key(b)
        k.key = 'documents/{}'.format(raw_filename)
        try:
            raw = k.get_contents_as_string()
            soup = BeautifulSoup(raw, 'html.parser')

            title, doc = get_content(soup)
            print title
            doc = clean_document(doc)
            cleaned_doc = utils.remove_stop_words(doc)
            cleaned_docs.append(cleaned_doc)
        except Exception as e:
            print e

    vsm = VSMapping()
    vsm.batch_fit(cleaned_docs)
    transformations = [vsm.transform(doc) for doc in cleaned_docs]
    for i, (vid, vector) in enumerate(transformations):
        # Update Document vector_id
        document_id = documents[i].to_dict()['id']
        print document_id, vid
        session.query(Document).filter(Document.id == document_id).update(
            {'vector_id': vid})
        session.commit()

    print len(cleaned_docs)
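
The snippet assumes an S3 bucket object `b` and boto 2's `Key` class; a plausible setup (the bucket name here is made up) would be:

# Assumed boto 2 setup for the snippet above; the bucket name is hypothetical.
from boto.s3.connection import S3Connection
from boto.s3.key import Key

conn = S3Connection()               # credentials taken from the environment
b = conn.get_bucket('braise-docs')  # hypothetical bucket name
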
Code example #6
def test(X):
    jaccard_scores = []
    for article in X:
        scores = []
        for sentence1, sentence2 in zip(article, article[1:]):
            stopped_sentence1 = utils.remove_stop_words(sentence1)
            stemmed_sentence1 = utils.stem_tokens(stopped_sentence1)
            stopped_sentence2 = utils.remove_stop_words(sentence2)
            stemmed_sentence2 = utils.stem_tokens(stopped_sentence2)

            scores.append(utils.jaccard(stemmed_sentence1, stemmed_sentence2))

        if scores:
            jaccard_scores.append(np.average(scores))
        else:
            jaccard_scores.append(0.1)

    return np.array(jaccard_scores).reshape(len(jaccard_scores), 1)
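
`utils.jaccard` is not shown; applied to two stemmed token lists it is presumably the usual set-based Jaccard similarity, roughly:

def jaccard(tokens_a, tokens_b):
    # Jaccard similarity: |intersection| / |union| of the two token sets.
    a, b = set(tokens_a), set(tokens_b)
    if not a and not b:
        return 0.0
    return len(a & b) / float(len(a | b))
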
Code example #7
def show_frecuencia():

    words = request.args.getlist('word')
    date = request.args.get('date')
    accounts = request.args.getlist('account')
    polaridad = request.args.getlist('polaridad')

    tweets = get_filtros(words, date, accounts, polaridad)
    users = get_tweets_with_its_user(tweets)
    n = [{"tweets": t, "user": u} for t, u in zip(tweets, users)]

    words3 = []
    counts = []
    for tweet in n:
        words2 = tweet["tweets"]["text"].split(" ")
        processedWords = utils.remove_stop_words(utils.emoji(utils.signos(utils.https(utils.numbers(words2)))))

        for word in processedWords:
            if word != '':
                if (word.lower() not in words3):
                    words3.append(word.lower())
                    counts.append(1)

                else:
                    index = words3.index(word.lower())
                    counts[index] += 1

    ans = [{"_id": word, "count": count} for word, count in zip(words3, counts)]
    fd = nltk.FreqDist(counts)
    freqq = []
    ans2 = []

    for item in ans:
        num = item['count']
        t = num/np.sum(counts)
        freqq.append(t)

    prom = np.mean(freqq)
    de = np.std(freqq)
    for item in ans:
        if freqq[ans.index(item)] > prom+de and item['_id'] not in words:
            ans2.append(item)

    return {"tweets": n, "data": ans2}
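
The parallel words3/counts bookkeeping above is simply a word-frequency count; the same step written with collections.Counter (a sketch that reuses the example's own filtering helpers) looks like:

from collections import Counter

word_counts = Counter()
for tweet in n:
    tokens = tweet["tweets"]["text"].split(" ")
    processed = utils.remove_stop_words(utils.emoji(utils.signos(utils.https(utils.numbers(tokens)))))
    word_counts.update(w.lower() for w in processed if w != '')

ans = [{"_id": w, "count": c} for w, c in word_counts.items()]
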
Code example #8
    def __init__(self):
        self.index = CustomInvertedIndex()
        http = urllib3.PoolManager()
        page = 'http://www.imdb.com/search/title?groups=top_1000&sort=user_rating&view=advanced&page='
        rows = []
        for i in range(1, 21):
            url = page + str(i)
            raw_html = http.request('GET', url).data.decode('utf-8')
            soup = BeautifulSoup(raw_html, 'lxml')

            movie_tags = soup.find_all('div', class_='lister-item-content')

            for movie_tag in movie_tags:
                movie = movie_tag.a.string
                title_terms = remove_stop_words(movie.lower().split(' '))
                directors_actors_tag = movie_tag.contents[9]

                directors = []
                actors = []
                currently_filling = directors
                for tag in directors_actors_tag.contents:
                    if tag.name == 'span':
                        currently_filling = actors
                        continue
                    if not isinstance(tag, bs4.element.NavigableString):
                        person = tag.string.lower()
                        currently_filling.append(person)

                genre_tag = movie_tag.contents[3].find(class_='genre')
                genres = genre_tag.string.lstrip('\n').rstrip().lower().split(
                    ', ')

                # time to update the index with the parsed values
                self.index.update(title_terms, movie, Priority.Title.value)
                for actor in actors:
                    self.index.update(actor.split(' '), movie,
                                      Priority.Actor.value)

                for director in directors:
                    self.index.update(director.split(' '), movie,
                                      Priority.Director.value)

                self.index.update(genres, movie, Priority.Genre.value)
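
CustomInvertedIndex is not shown here; a minimal sketch of the term-to-movie mapping its update() appears to maintain (an assumption based on how it is called above):

from collections import defaultdict

class CustomInvertedIndex:
    # Guessed interface: each term maps to {movie: accumulated priority weight}.
    def __init__(self):
        self.postings = defaultdict(dict)

    def update(self, terms, movie, weight):
        for term in terms:
            self.postings[term][movie] = self.postings[term].get(movie, 0) + weight
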
Code example #9
def question_c():
    logging.info(
        "<Question C> Getting the significance and TFxICF representation")
    all_categories = train_full_set.target_names

    all_docs_per_category = []

    classes_list = [
        train_full_set.target_names.index("comp.sys.ibm.pc.hardware"),
        train_full_set.target_names.index("comp.sys.mac.hardware"),
        train_full_set.target_names.index("misc.forsale"),
        train_full_set.target_names.index("soc.religion.christian")
    ]

    logging.info(
        "Store data from all docs of a certain category as entries in all_data_category"
    )
    for cat in all_categories:
        train_category = utils.fetch_data([cat])[0]
        data_category = train_category.data
        temp = ''
        for doc in data_category:
            temp += ' ' + doc
        all_docs_per_category.append(temp)

    logging.info("Now build frequency tables for each class")

    vectorized_newsgroups_train = utils.remove_stop_words(
        all_docs_per_category)

    print(vectorized_newsgroups_train.shape)

    max_term_freq_per_category = [0] * vectorized_newsgroups_train.shape[0]
    category_count_per_term = [0] * vectorized_newsgroups_train.shape[1]

    for i in range(vectorized_newsgroups_train.shape[0]):
        max_term_freq_per_category[i] = max(
            vectorized_newsgroups_train[i].data)

    category_count_per_term = vectorized_newsgroups_train.sum(axis=0)

    print(max_term_freq_per_category)
    print(category_count_per_term)
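
The two arrays computed above are the usual ingredients of a TFxICF weight. One common definition, by analogy with TF-IDF (the exact variant applied downstream is not shown here), is:

import math

def tficf(term_freq, max_term_freq_in_category, n_categories, categories_containing_term):
    # Normalized term frequency times inverse category frequency (one common variant).
    tf = 0.5 + 0.5 * term_freq / float(max_term_freq_in_category)
    icf = math.log(float(n_categories) / categories_containing_term)
    return tf * icf
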
Code example #10
File: doc_collector.py  Project: acrosson/braise
def collect(url, vsm):
    print url
    print('Downloading doc...')
    title, doc, raw_filename = download_document(url)

    print('Cleaning doc...')
    doc = clean_document(doc)
    cleaned_doc = utils.remove_stop_words(doc)

    print('Fitting VSM...')
    vsm.partial_fit(cleaned_doc)

    doc_vid, doc_vector = vsm.transform(doc)

    print('Summarize...')
    summary = summarize(title, doc, cleaned_doc, doc_vector, vsm.feature_names)

    save_document(title, summary, url, raw_filename, doc_vid)

    print 'done'
Code example #11
    def __street_sweep(self, streetFlags, stopWords):
        """Takes a list of common street flags (road, street, ...) and a list of
        phrasing words to remove from the address (into, the, on, ...) and
        searches for possible addresses within the comment field. The longest
        address with a street number is then selected and the street and number
        fields are updated.
        """

        i = 0
        length = 0
        num = False
        punc_comment = remove_punc(self.comment)
        while i < len(streetFlags):
            f_num = False
            punc_flag = remove_punc(streetFlags[i])
            pat = re.compile(r"(\d{1,3}[a-zA-Z]?)?\s?([a-zA-Z]*\s?[a-zA-Z]+\s)" +
                             r"(" + punc_flag + r")(,|\s|.|$)", re.IGNORECASE)
            match = re.search(pat, punc_comment)
            if match:
                if match.group(2):
                    if match.group(1):
                        f_num = True
                    templine = remove_stop_words(match.group(2), stopWords)
                    if templine:
                        templine = templine.strip(' ')
                        f_len = len(templine.split(' '))
                    else:
                        f_len = 0
                    # Larger addresses are favoured and street numbers are
                    # favoured above all else
                    if (f_len > length and num == f_num) or (f_num and not num)\
                            and f_len > 0:
                        self.road = templine + ' ' + match.group(3)
                        num = f_num
                        length = f_len
                        if num:
                            self.number = match.group(1)
            i += 1
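
A quick demonstration of what the compiled pattern captures on a sample comment (hypothetical input; here remove_punc is assumed to have already stripped punctuation):

import re

comment = "car went off the road at 123 Main Street yesterday"
punc_flag = "street"
pat = re.compile(r"(\d{1,3}[a-zA-Z]?)?\s?([a-zA-Z]*\s?[a-zA-Z]+\s)"
                 r"(" + punc_flag + r")(,|\s|.|$)", re.IGNORECASE)

match = re.search(pat, comment)
print(match.group(1))  # '123'    -> candidate street number
print(match.group(2))  # 'Main '  -> candidate street name (trailing space intact)
print(match.group(3))  # 'Street' -> the street flag that triggered the match
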
Code example #12
File: passage_indexer.py  Project: lichguard/APR
    def execute_query(self, query):
        start = time.time()
        query_tokens = process_and_tokenize_string(query)
        unprocessed_query_tokens = split_strings(query)
        self.logger.info(" Executing Query: '" + str(query) +
                         "'  ---- tokens:" + str(query_tokens))
        top_docs = [TopPassage(doc) for doc in self.docs]
        question_class = -1
        for i, wh in enumerate(wh_questions):
            if wh in unprocessed_query_tokens:
                question_class = i
                break

        #pos_list = nltk.pos_tag(unprocessed_query_tokens)
        tokens_synonyms = []
        for token in remove_stop_words(unprocessed_query_tokens):
            tokens_synonyms += get_processed_synonyms(token)

        #print(tokens_synonyms)

        ngrams_vector = self.ngrams.query(query_tokens, self.docs)
        expanded_ngram_vector = self.ngrams.query(tokens_synonyms, self.docs)

        for i in range(len(top_docs)):
            progbar(i, len(top_docs))
            top_docs[i].update_score(ScoreType.ngram, ngrams_vector[i])
            top_docs[i].update_score(ScoreType.expanded_ngram,
                                     expanded_ngram_vector[i])

            top_docs[i].calculate_score()
        print(' ')
        top_docs.sort(key=lambda x: x.score, reverse=True)

        end = time.time()
        self.logger.info("execute_query complete. elapsed time: " +
                         str(end - start) + " secs")
        return top_docs
Code example #13
    def init(self, title, snippets):
        with open(title, "rb") as f:
            p = pickle.load(f)
            self.titles_count = len(p)

            # Shuffle the data
            sidx = np.random.permutation(self.titles_count)
            shuffled_original_data = [(p[i][0], p[i][1], p[i][2]) for i in sidx]

            for _ in shuffled_original_data:
                self.titles_raw.append(_[0].strip().split())
                self.titles_tok.append(
                    utils.remove_stop_words(_[1].strip().split()))
                self.titles_id.append(_[2])

        self.titles_average_len = sum(
            [len(doc) + 0.0
             for doc in self.titles_tok]) / self.titles_count

        with open(snippets, "rb") as f:
            self.id_snippets = pickle.load(f)

        for title in self.titles_tok:
            tmp = {}
            for word in title:
                tmp[word] = tmp.get(word, 0) + 1  # count how many times each word occurs in this document
            self.f.append(tmp)
            self.titles_vec.append(
                utils.sent2matrix(self.w2v, self.embedding_dim, title))
            for k in tmp.keys():
                self.df[k] = self.df.get(k, 0) + 1
        for k, v in self.df.items():
            self.idf[k] = math.log(self.titles_count - v + 0.5) - math.log(v +
                                                                           0.5)
        for title in self.titles_tok:
            self.titles_idf.append(utils.sentence_idf_vector(self.idf, title))
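
The idf built above is the BM25-style inverse document frequency, log((N - df + 0.5) / (df + 0.5)), with N = titles_count and df the number of titles containing the word. A toy check with made-up numbers:

import math

titles_count = 100   # hypothetical corpus size
df = 7               # hypothetical document frequency of one word
idf = math.log(titles_count - df + 0.5) - math.log(df + 0.5)
print(idf)  # same value as math.log((titles_count - df + 0.5) / (df + 0.5)), roughly 2.52
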
Code example #14
def load_data(args, mode='train'):
    text = []
    usecols = list(
        map(lambda x: int(x),
            args.get('Data', 'usecols').split(',')))
    path = args.get('Data', 'dataset') + '/' + mode + '.csv'
    print('\n path: ' + path)
    data = pd.read_csv(path,
                       usecols=usecols,
                       encoding=args['Data'].get('encoding'),
                       sep=args['Data'].get('csv_sep'),
                       doublequote=True)
    labels = data.iloc[:, 0].tolist()
    if args.get('Data', 'dataset') == 'yelp':
        text = data.iloc[:, 1].tolist()
    elif args.get('Data', 'dataset') == 'ag_news':
        text = (data.iloc[:, 1] + ' ' + data.iloc[:, 2]).tolist()

    text = [remove_stop_words(t) for t in text]
    text = [remove_special_characters(t) for t in text]
    text = [remove_numbers(t) for t in text]
    text = [remove_single_letters(t) for t in text]
    text = [remove_single_characters(t) for t in text]
    return labels, text
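
The cleaning helpers chained above are not shown; plausible regex-based stand-ins (assumptions, not the project's actual utils module) might look like:

import re

def remove_special_characters(text):
    # Keep only letters, digits and whitespace.
    return re.sub(r'[^0-9a-zA-Z\s]+', ' ', text)

def remove_numbers(text):
    return re.sub(r'\d+', ' ', text)

def remove_single_letters(text):
    # Drop isolated one-character tokens such as stray initials.
    return ' '.join(t for t in text.split() if len(t) > 1)
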
Code example #15
File: Driver.py  Project: IvanIZ/SpeechRec
def extract_key_words(file_name):
    """
    Args:
        file_name: an .mov file from which the key words will be extracted

    Returns: a list of key phrases extracted from the input video file

    """

    # cut the lecture videos into different scenes
    scenes = VideoToText.find_scenes(file_name,
                                     min_scene_length=1,
                                     abs_min=0.75,
                                     abs_max=0.98,
                                     find_subscenes=True,
                                     max_subscenes_per_minute=12)

    # for each scene, get its dictionary
    scene_text_dict, frequent_patterns = VideoToText.scene_to_text(scenes)
    print(scene_text_dict)
    print(
        "Raw Dictionary =================================================================================="
    )

    # combine the multiple dictionaries into one
    phraseDict = convert_dict_to_phraseDict(scene_text_dict)
    print(phraseDict)
    print(
        "Combined raw dictionary =================================================================================="
    )

    # clean up each dictionary by removing non-letters and non-numbers
    phraseDict = process_phraseDict(phraseDict)
    print(phraseDict)
    print(
        "cleaned dictionaries =================================================================================="
    )

    # process the dictionary by comparing frequency with the brown corpus
    phraseList = utils.process_by_frequency(phraseDict)
    print(phraseList)
    print(
        "initial phrase list =================================================================================="
    )

    # process the phrase list by removing stop words
    phraseList = utils.remove_stop_words(phraseList)
    print(phraseList)
    print(
        "Final list after removing stop words ===================================================================="
    )
    print()

    # extract frequent patterns and put them into a list
    print(frequent_patterns)
    print(
        "frequent patterns ==============================================================================="
    )
    print()
    frequent_patterns_list = utils.patterns_to_list(frequent_patterns)

    # combine the frequent patterns and the single-phrase list into one list
    phraseList = phraseList + frequent_patterns_list
    print(phraseList)
    print(
        "Final phrase list ==============================================================================="
    )
    print()

    return phraseList
Code example #16
def show_grafo():

    words = request.args.getlist('word')
    date = request.args.get('date')
    accounts = request.args.getlist('account')
    polaridad = request.args.getlist('polaridad')

    tweets = get_filtros(words, date, accounts, polaridad)

    tweets_users = get_tweets_with_its_user(tweets)
    n = [{"tweets": t, "user": u} for t, u in zip(tweets, tweets_users)]

    # users of the tweets
    userIds = []
    for tweet in tweets:
        userId = tweet['userId']
        if userId not in userIds:
            userIds.append(userId)

    users = []

    for userId in userIds:
        userInfo = searchUserId(userId)
        if len(userInfo) > 0:
            userTweets = []
            for tweet in tweets:
                if userId == tweet['userId']:
                    userTweets.append(tweet['text'])
            users.append({'user': '******' + userInfo[0]['screen_name'], 'tweets': userTweets})

    users2 = []

    for user in users:
        filtered = []
        for tweet in user['tweets']:
            words_text = list(tweet.split(" "))
            filtered.append(utils.remove_stop_words(words_text))
            palabras = filtered1(filtered)
            sin_num = utils.numbers(palabras)
            sin_imag = utils.https(sin_num)
            sin_puntos = utils.signos(sin_imag)
            sin_emojis = utils.emoji(sin_puntos)
        users2.append({'name': user['user'], 'words': sin_emojis})

    users3 = []
    for i in range(len(users2)):
        user1 = users2[i]
        words = []
        for j in range(len(users2)):
            user2 = users2[j]
            if i != j:
                for word in user1['words']:
                    for word2 in user2['words']:
                        if word.lower() == word2.lower() and word.lower() not in words and word != "":
                            words.append(word.lower())

        users3.append({'name': user1['name'], 'words': words})

    return {'tweets': n, 'data': users3}
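
The pairwise comparison at the end amounts to intersecting each user's vocabulary with the union of everyone else's; a compact sketch of that step (ordering not preserved):

vocabs = [{w.lower() for w in u['words'] if w != ''} for u in users2]
users3 = []
for i, u in enumerate(users2):
    others = set().union(*(v for j, v in enumerate(vocabs) if j != i))
    users3.append({'name': u['name'], 'words': sorted(vocabs[i] & others)})
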