Ejemplo n.º 1
0
 def _calculate_word_frequencies(self, text):
     """Count how many times each token occurs in *text*.

     :param text: raw input string; normalized with common_utils.prepare_text
                  before tokenization
     :returns: mapping of word -> occurrence count; lookups of absent words
               yield 0 (Counter, like the defaultdict it replaces)
     """
     words = common_utils.tokenize(common_utils.prepare_text(text))
     # collections.Counter is the stdlib idiom for frequency counting and
     # replaces the manual defaultdict(int) accumulation loop.
     return collections.Counter(words)
Ejemplo n.º 2
0
 def _calculate_word_frequencies(self, text):
     """Return a frequency table of the tokens found in *text*.

     :param text: input string; run through common_utils.prepare_text first
     :returns: word -> count mapping; missing words read as 0, matching the
               original defaultdict(int) behavior
     """
     prepared = common_utils.prepare_text(text)
     # Counter does the increment-per-token loop in C and is the idiomatic
     # replacement for a hand-rolled defaultdict(int) counting loop.
     return collections.Counter(common_utils.tokenize(prepared))
Ejemplo n.º 3
0
def keyphrases_table(keyphrases,
                     texts,
                     similarity_measure=None,
                     synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing their matching scores in a set of texts.

    The resulting table is stored as a dictionary of dictionaries,
    where the entry table["keyphrase"]["text"] corresponds
    to the matching score (0 <= score <= 1) of keyphrase "keyphrase"
    in the text named "text".

    :param keyphrases: list of strings
    :param texts: dictionary of form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: Language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level and texts
              on the second level.
    """

    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    # Materialize the dict views: on Python 3, texts.keys() is not indexable,
    # so the original text_titles[j] would raise TypeError. Listing both in
    # one place also snapshots keys and values in a consistent order.
    text_titles = list(texts.keys())
    text_collection = list(texts.values())
    similarity_measure.set_text_collection(text_collection, language)

    keyphrases_prepared = {
        keyphrase: utils.prepare_text(keyphrase)
        for keyphrase in keyphrases
    }
    total_scores = len(text_collection) * len(keyphrases)
    res = {}
    i = 0  # running counter across both loops, only for progress reporting
    for keyphrase in keyphrases:
        if not keyphrase:  # skip empty keyphrases to avoid empty table rows
            continue
        res[keyphrase] = {}
        for j, title in enumerate(text_titles):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            # The measure addresses texts by their index in text_collection.
            res[keyphrase][title] = similarity_measure.relevance(
                keyphrases_prepared[keyphrase],
                text=j,
                synonimizer=synonimizer)

    logging.clear()

    return res
Ejemplo n.º 4
0
def keyphrases_table(keyphrases, texts, similarity_measure=None, synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing their matching scores in a set of texts.

    The resulting table is stored as a dictionary of dictionaries,
    where the entry table["keyphrase"]["text"] corresponds
    to the matching score (0 <= score <= 1) of keyphrase "keyphrase"
    in the text named "text".

    :param keyphrases: list of strings
    :param texts: dictionary of form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: Language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level and texts
              on the second level.
    """

    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    # list() the dict views: Python 3 views are not indexable (the original
    # text_titles[j] would raise TypeError), and xrange below was replaced
    # with range, which exists on both Python 2 and 3.
    text_titles = list(texts.keys())
    text_collection = list(texts.values())
    similarity_measure.set_text_collection(text_collection, language)

    keyphrases_prepared = {keyphrase: utils.prepare_text(keyphrase)
                           for keyphrase in keyphrases}
    total_scores = len(text_collection) * len(keyphrases)
    res = {}
    i = 0  # overall progress counter across keyphrases * texts
    for keyphrase in keyphrases:
        if not keyphrase:  # empty keyphrases would produce meaningless rows
            continue
        res[keyphrase] = {}
        for j, title in enumerate(text_titles):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            # The similarity measure addresses texts by collection index.
            res[keyphrase][title] = similarity_measure.relevance(
                keyphrases_prepared[keyphrase],
                text=j,
                synonimizer=synonimizer)

    logging.clear()

    return res
Ejemplo n.º 5
0
    def set_text_collection(self, texts, language=consts.Language.ENGLISH):
        """Prepare a text collection for relevance computation.

        Tokenizes and preprocesses every text, then builds the term vector
        space and the tf/idf statistics reused by later queries.

        :param texts: sequence of raw text strings
        :param language: language used for stemming when the vector space
                         is stem-based
        """
        self.language = language
        if self.vector_space == consts.VectorSpace.STEMS:
            self.stemmer = snowball.SnowballStemmer(self.language)
        raw_tokens = []
        total_texts = len(texts)
        # enumerate replaces the Python-2-only xrange index loop (xrange is a
        # NameError on Python 3) and avoids manual indexing into texts.
        for i, text in enumerate(texts):
            raw_tokens.append(
                utils.tokenize_and_filter(utils.prepare_text(text)))
            logging.progress("Preparing texts", i + 1, total_texts)

        logging.clear()

        # Convert to stems or lemmata, depending on the vector space type
        preprocessed_tokens = self._preprocess_tokens(raw_tokens)

        # Terms define the vector space (they can be words, stems or lemmata). They should be
        # defined once here because they will be reused when we compute td-idf for queries
        self.terms = list(set(utils.flatten(preprocessed_tokens)))
        self.tf, self.idf = self._tf_idf(preprocessed_tokens)
Ejemplo n.º 6
0
def main():
    """Command-line entry point for EAST.

    Parses ``east <command> <subcommand> [options] args`` and dispatches to
    the keyphrases table/graph builders. Returns 1 on any usage error.

    NOTE(review): this is Python 2 code (print statements, str.decode on the
    result of os.path.basename, map() treated as a list) — it will not run
    unchanged on Python 3.
    """
    args = sys.argv[1:]
    # Short options: -a algorithm, -f output format, -l significance level,
    # -t score threshold (value-taking); -d and -s are boolean flags.
    opts, args = getopt.getopt(args, "a:f:l:t:ds")
    opts = dict(opts)
    opts.setdefault("-a", "easa")   # Algorithm to use for computing ASTs
    opts.setdefault("-l", "0.6")    # Level of significance for graph construction
    opts.setdefault("-t", "0.25")   # Threshold of the matching score
    # NOTE(msdubov): -f (output format) option takes different values for different
    #                subcommands and its default value is set in corresponding handlers.

    if len(args) < 2:
        print("Invalid syntax: EAST should be called as:\n\n"
              "    east <command> <subcommand> [options] args\n\n"
              "Commands available: keyphrases.\n"
              "Subcommands available: table/graph.")
        return 1

    command = args[0]
    subcommand = args[1]

    if command == "keyphrases":

        if len(args) < 4:
            print('Invalid syntax. For keyphrases analysis, EAST should be called as:\n\n'
                  '    east keyphrases <subcommand> [options] "path/to/keyphrases.txt" '
                  '"path/to/texts/dir"')
            return 1

        keyphrases_file = os.path.abspath(args[2])
        input_path = os.path.abspath(args[3])
        use_synonyms = "-s" in opts
        normalized_scores = "-d" not in opts  # -d disables score normalization
        ast_algorithm = opts["-a"]
        significance_level = float(opts["-l"])
        score_threshold = float(opts["-t"])

        # A directory argument means "all .txt files inside"; a file argument
        # is used as the single input text.
        if os.path.isdir(input_path):
            input_files = [os.path.abspath(input_path) + "/" + filename
                           for filename in os.listdir(input_path)
                           if filename.endswith(".txt")]
        else:
            input_files = [os.path.abspath(input_path)]

        texts = {}
        for filename in input_files:
            with open(filename) as f:
                # [:-4] strips the ".txt" extension to form the text's name.
                # NOTE(review): .decode("utf-8") assumes Python 2 byte-string
                # paths; on Python 3 str has no .decode. Also, for the
                # single-file branch the name may not end in ".txt" — confirm.
                text_name = os.path.basename(filename).decode("utf-8")[:-4]
                texts[text_name] = f.read()

        with open(keyphrases_file) as f:
            # One keyphrase per line, normalized before matching.
            keyphrases = map(lambda k: utils.prepare_text(k), f.read().splitlines())

        if use_synonyms:
            synonimizer = synonyms.SynonymExtractor(input_path)
        else:
            synonimizer = None

        if subcommand == "table":

            keyphrases_table = applications.keyphrases_table(keyphrases, texts, ast_algorithm,
                                                             normalized_scores, synonimizer)
            opts.setdefault("-f", "xml")  # Table output format (also "csv" possible)
            table_format = opts["-f"].lower()

            if table_format == "xml":
                res = formatting.table2xml(keyphrases_table)
            elif table_format == "csv":
                res = formatting.table2csv(keyphrases_table)
            else:
                print ("Unknown table format: '%s'. "
                       "Please use one of: 'xml', 'csv'." % table_format)
                return 1

            # "ignore" drops characters the target encoding cannot represent.
            print res.encode("utf-8", "ignore")

        elif subcommand == "graph":

            graph = applications.keyphrases_graph(keyphrases, texts, significance_level,
                                                  score_threshold, ast_algorithm,
                                                  normalized_scores, synonimizer)

            opts.setdefault("-f", "edges")  # Graph output format (also "gml" possible)
            graph_format = opts["-f"].lower()

            if graph_format == "gml":
                res = formatting.graph2gml(graph)
            elif graph_format == "edges":
                res = formatting.graph2edges(graph)
            else:
                print ("Unknown graph format: '%s'. "
                       "Please use one of: 'gml', 'edges'." % graph_format)
                return 1

            print res.encode("utf-8", "ignore")

        else:
            print "Invalid subcommand: '%s'. Please use one of: 'table', 'graph'." % subcommand
            return 1

    else:
        print "Invalid command: '%s'. Please use one of: 'keyphrases'." % command
        return 1