コード例 #1
0
def autotag(inp):

    if (len(inp) < 1):
        print "Please specify correct filename(s)"
        return

    rake_object = rake.Rake("RAKE/SmartStoplist.txt", 3, 3, 1)
    files = inp
    for file in files:
        if (os.path.isfile(file)):
            fname, extension = os.path.splitext(file)
            extension = extension[1:].strip()
            if (extension != ''):
                if (extension not in tag_vs_files):
                    tag_vs_files[extension] = []
                if (file not in tag_vs_files[extension]):
                    tag_vs_files[extension].append(file)
            file = os.path.abspath(file)
            if (file not in tag_vs_files[extension]):
                tag_vs_files[extension].append(file)
            print "Added: ", file, "with tag: ", extension

            #RAKE keywords generation
            print "Adding Text analytic tags. . ."

            file_handle = open(file, 'r')
            keywords_ratings = rake_object(file_handle.read())[:50]
            for tag, rating in keywords_ratings:
                if tag not in tag_vs_files:
                    tag_vs_files[tag] = []
                if file not in tag_vs_files[tag]:
                    tag_vs_files[tag].append(file)
                print "Added: ", file, "with tag: ", tag

        elif (os.path.exists(file)):
            for dirname, dirnames, filenames in os.walk(file):
                for filename in filenames:
                    fname, extension = os.path.splitext(filename)
                    filepath = os.path.join(dirname, filename)
                    #print "~",dirname, dirnames, filename
                    filepath = os.path.abspath(filepath)
                    extension = extension[1:].strip()
                    if (extension != ''):
                        if (extension not in tag_vs_files):
                            tag_vs_files[extension] = []
                        if (filepath not in tag_vs_files[extension]):
                            tag_vs_files[extension].append(filepath)
                    if (filepath not in tag_vs_files[tag]):
                        tag_vs_files[tag].append(filepath)
                    print "Added: ", filepath, "with tag: ", tag
        else:
            print "No such file:", file
    outputfile = open('tag_vs_files_dictionary.pkl', 'wb')
    cPickle.dump(tag_vs_files, outputfile)
    outputfile.close()
コード例 #2
0
ファイル: advancedMiner.py プロジェクト: Salil999/Tilder
def find_keywords(text):
    """Extract keyword phrases from *text* with RAKE.

    Args:
            text (string): Where we extract the keywords from

    Returns:
            keywords (list): A list of tuples that contains the key words
    """
    extractor = rk.Rake("SmartStoplist.txt", 4, 4, 2)
    return extractor.run(text)
コード例 #3
0
def generate_heatmap_data_no_db():
    """Generates all heatmap data without touching the database.

    Builds a normalised dates x locations activity matrix from the Hansard
    records, then prints the top RAKE keyword for the strongest peak of each
    date. Nothing is written back to the database.
    """
    # Fetch all dates.
    dates = list(hansard.db.query(hansard.Hansard.date).distinct())
    # Fetch all locations.
    with open('name_to_loc.json') as f:
        loc_to_pos = json.load(f)
        locations = sorted(loc_to_pos.keys())
    # Assign each location an ID based on its index.
    loc_to_id = {j: i for i, j in enumerate(locations)}
    # Generate a NumPy array of dates x locations.
    data = numpy.zeros((len(dates), len(locations)))
    # For each date and location, count Hansard items.
    for i, (date, ) in enumerate(dates):
        results = hansard.Hansard.query.filter_by(date=date).all()
        for result in results:
            j = loc_to_id[result.location]
            data[i, j] += result.hits
    # Compute the mean heatmap.
    mean = data.mean(axis=0)
    # Divide out to normalise (+1 avoids division by zero for locations
    # with no activity).
    data /= mean + 1

    # Get the top n peaks for each date. argsort is ascending, so the
    # strongest of the kept peaks sits in the LAST column of each row.
    all_peaks = numpy.argsort(data, axis=1)[:, -N_PEAKS:]

    for i, (date, ) in enumerate(dates):
        # Have a gander at the top peak.
        # BUG FIX: the original read all_peaks[i, 0], which is the *weakest*
        # of the retained peaks; the strongest is at index -1.
        peak = all_peaks[i, -1]
        location = locations[peak]
        h = hansard.Hansard.query.filter_by(date=date,
                                            location=location).first()
        text = h.text

        # From github.com/aneesha/RAKE. (The original also ran the manual
        # RAKE pipeline -- split_sentences / candidate keywords / word
        # scores -- but never used its result; that dead code is removed.)
        r = rake.Rake('RAKE/SmartStoplist.txt')
        keywords = r.run(text)
        print(keywords[0][0])
コード例 #4
0
def generate_heatmap_data():
    """Generates all heatmap data and stores one DateHeat row per date.

    Builds a normalised dates x locations activity matrix from the Hansard
    records, then for each date serialises the heat values, the top-N peak
    coordinates, and a RAKE-derived topic/description per peak into JSON and
    commits them to the database.
    """
    # Fetch all dates.
    dates = list(hansard.db.query(hansard.Hansard.date).distinct())
    # Fetch all locations.
    with open('name_to_loc.json') as f:
        loc_to_pos = json.load(f)
        locations = sorted(loc_to_pos.keys())
    # Assign each location an ID based on its index.
    loc_to_id = {j: i for i, j in enumerate(locations)}
    # Generate a NumPy array of dates x locations.
    data = numpy.zeros((len(dates), len(locations)))
    # For each date and location, count Hansard items.
    for i, (date, ) in enumerate(dates):
        results = hansard.Hansard.query.filter_by(date=date).all()
        for result in results:
            j = loc_to_id[result.location]
            data[i, j] += result.hits
    # Compute the mean heatmap.
    mean = data.mean(axis=0)
    # Divide out to normalise (+1 avoids division by zero for locations
    # with no activity).
    data /= mean + 1

    # Get the top n peaks for each date.
    all_peaks = numpy.argsort(data, axis=1)[:, -N_PEAKS:]

    # For each date, dump to a mildly horrifying JSON string. Then store in the
    # database!
    for i, (date, ) in enumerate(dates):
        # [{name, lat, lon, weight}]
        heat = []
        # {location: [lat, lon]}
        peaks = {}
        # {location: {title, description}}
        topics = {}
        for j, location in enumerate(locations):
            if data[i, j] != 0:
                lat, lon = loc_to_pos[location]
                heat.append({
                    'name': location,
                    'lat': lat,
                    'lon': lon,
                    'weight': data[i, j],
                })

        for location_idx in all_peaks[i]:
            location = locations[location_idx]
            lat, lon = loc_to_pos[location]
            peaks[location] = (lat, lon)

            h = hansard.Hansard.query.filter_by(date=date,
                                                location=location).first()
            text = h.text.decode('utf-8')
            # Topic = highest-scoring RAKE keyword. (The original also ran
            # the manual pipeline from github.com/aneesha/RAKE --
            # split_sentences / candidate keywords / word scores -- but
            # never used its result; that dead code is removed.)
            r = rake.Rake('RAKE/SmartStoplist.txt')
            keywords = r.run(text)
            topic = keywords[0][0]
            # Grab a description. We'll use the longest text that includes the
            # topic.
            subtexts = text.split('|')
            best_desc = ''
            for subtext in subtexts:
                if len(subtext) <= len(best_desc):
                    continue

                assert type(topic) == type(subtext)

                if topic.lower() in subtext.lower():
                    best_desc = subtext

            topics[location] = {
                'topic': topic,
                'description': best_desc,
                'hansard_ids': h.hids.split('|'),
            }

        heat = json.dumps(heat)
        peaks = json.dumps(peaks)
        topics = json.dumps(topics)

        dh = database.DateHeat(date, heat, peaks, topics)
        database.db_session.add(dh)
        database.db_session.commit()
コード例 #5
0
def getKeywords(text):
    """Run RAKE over *text* and return the extracted (keyword, score) pairs."""
    extractor = rake.Rake("RAKE/SmartStoplist.txt", 3, 3, 1)
    return extractor.run(text)
コード例 #6
0
def get_keywords(url):
    """Fetch the text behind *url* and extract RAKE keywords from it."""
    page_text = get_text2(url)
    miner = rake.Rake("RAKE/SmartStoplist.txt", 5, 3, 4)
    return miner.run(page_text)