def count_word_occurance():
    """Render the word-count page for the current request.

    The ``session_id`` query argument is read for both request methods and
    passed to the template as a ready-made ``?session_id=...`` query string.

    GET:
        Render ``word_count.html`` with only the session query string.

    POST:
        Stem the submitted form field ``word`` with
        ``indeed_scrape.stemmer``, then count how many rows of the session's
        ``summary_stem`` column contain that stem as a token (each row counts
        at most once). The template receives the match count, the total row
        count, and the share of matching rows as a percentage string rounded
        to one decimal place.

    Returns:
        The rendered template, or ``None`` for any other request method
        (unchanged from the original behaviour).
    """
    session_id = request.args.get("session_id")
    session_string = "?session_id=%s" % session_id

    if request.method == 'POST':
        word = request.form['word']
        word_stem = indeed_scrape.stemmer.stem(word)

        # The return value is not needed here; the call is kept in case
        # get_sess has side effects (e.g. validating the session) -- confirm.
        get_sess(session_id)
        df = load_csv(session_id)

        # A document counts once no matter how often the stem appears in it.
        count = sum(
            1 for doc in df['summary_stem']
            if word_stem in indeed_scrape.toker(doc)
        )

        number = df.shape[0]
        if number:
            # Scale to a percentage *before* rounding. Rounding the fraction
            # first limited the result to multiples of 10 and leaked float
            # noise such as 30.000000000000004 into the page.
            percent = str(round(100.0 * count / number, 1))
        else:
            # No documents loaded: avoid dividing by zero.
            percent = "0.0"

        return render_template('word_count.html',
                               count=count,
                               session_id=session_string,
                               word=word,
                               number=number,
                               percent=percent)

    if request.method == 'GET':
        return render_template('word_count.html', session_id=session_string)
Beispiel #2
0
    def normalize_titles(self, titles):
        """Return cleaned, lower-cased versions of the given job titles.

        Each title goes through these steps, in order:

        1. ``/`` and ``-`` are replaced by spaces.
        2. Leading and trailing whitespace is removed and runs of two or
           more whitespace characters collapse to a single space.
        3. The title is tokenized with ``indeed_scrape.toker`` and the
           tokens are filtered through ``Indeed.len_tester`` (presumably a
           token-length filter -- confirm against its definition).
        4. The surviving tokens are re-joined with single spaces, and the
           whitespace-delimited words ``amp`` and ``and`` are replaced by a
           single space.
        5. The result is lower-cased.

        Args:
            titles: iterable of title strings.

        Returns:
            A list with one normalized string per input title, in input
            order.
        """
        ind = indeed_scrape.Indeed("kw")

        out = []
        for title in titles:
            cleaned = title.replace("/", " ").replace("-", " ")
            cleaned = re.sub(r"^\s+", "", cleaned)
            # The anchor must follow the whitespace: the previous pattern,
            # "$\s+", could only ever remove a lone trailing newline.
            cleaned = re.sub(r"\s+$", "", cleaned)
            cleaned = re.sub(r"\s{2,}", " ", cleaned)

            tokens = indeed_scrape.toker(cleaned)
            tokens = ind.len_tester(tokens)
            normalized = " ".join(tokens)
            normalized = re.sub(r'\s+amp\s+', ' ', normalized)
            normalized = re.sub(r'\s+and\s+', ' ', normalized)

            # Lower-casing happens last, so the amp/and removal above stays
            # case-sensitive exactly as before.
            out.append(normalized.lower())

        # Build a real list so the result is indexable and re-iterable on
        # Python 3 as well, where map() would hand back a one-shot iterator.
        return out