Example No. 1
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords and groups by word.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <WORD>:
          [
            [<YEAR>, <NUM_WORDS>],
            ...
          ],
          <WORD>:
          ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by word
    :rtype: dict
    """
    keywords = []
    with open(config_file, "r") as f:
        keywords = [query_utils.normalize(word) for word in list(f)]
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [((year, word), 1), ...]
    words = documents.flatMap(
        lambda year_document: [((year_document[0],
                                 query_utils.normalize(word)), 1)
                               for (_, word) in year_document[1].scan_words()])
    # [((year, word), 1), ...]
    matching_words = words.filter(
        lambda yearword_count: yearword_count[0][1] in keywords)

    # [((year, word), num_words), ...]
    # =>
    # [(word, (year, num_words)), ...]
    # =>
    # [(word, [(year, num_words), ...]), ...]
    result = matching_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][1],
              (yearword_count[0][0], yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0], list(year_wordcount[1]))) \
        .collect()
    return result
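
For reference, the reshaping done by reduceByKey and groupByKey above can be mimicked without Spark. The following is a minimal, Spark-free sketch with invented (year, word) pairs; it only illustrates the {word: [[year, count], ...]} shape of the result.

# A minimal, Spark-free sketch of the same aggregation, using invented
# (year, word) pairs to show the {word: [[year, count], ...]} result shape.
from collections import Counter, defaultdict

pairs = [(1840, "liberty"), (1840, "liberty"), (1841, "liberty"), (1841, "press")]
keywords = {"liberty", "press"}

counts = Counter((year, word) for year, word in pairs if word in keywords)
result = defaultdict(list)
for (year, word), num_words in counts.items():
    result[word].append([year, num_words])
print(dict(result))
# {'liberty': [[1840, 2], [1841, 1]], 'press': [[1841, 1]]}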
Example No. 2
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Finds every unique word and its frequency.

    config_file can be the path to a configuration file with a
    threshold, the minimum number of occurrences of the word for the
    word to be counted. This file, if provided, must be of form:

        threshold: <COUNT>

    where <COUNT> is >= 1.

    If no configuration file is provided then a threshold of 1 is
    assumed.

    Words in documents are normalized, by removing all non-'a-z|A-Z'
    characters.

    Returns result of form:

        {
          <WORD>: <COUNT>,
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file (optional)
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: unique words and their frequencies
    :rtype: dict
    """
    threshold = 1
    if config_file is not None and\
       os.path.exists(config_file) and\
       os.path.isfile(config_file):
        with open(config_file, "r") as f:
            config = yaml.safe_load(f)
        value = config["threshold"]
        threshold = max(threshold, value)

    # [article, article, ...]
    articles = issues.flatMap(
        lambda issue: [article for article in issue.articles])

    # [(word, 1), (word, 1), ...]
    words = articles.flatMap(lambda article: [(query_utils.normalize(word), 1)
                                              for word in article.words])

    # [(word, 1), (word, 1), ...]
    # =>
    # [(word, count), (word, count), ...]
    word_counts = words. \
        reduceByKey(add). \
        filter(lambda word_count: word_count[1] > threshold). \
        collect()
    return word_counts
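
The reduce-and-filter stage can be tried locally. Below is a small sketch, assuming a local PySpark installation; the word list and threshold are invented.

# Local illustration of the reduceByKey/filter stage with invented words;
# assumes PySpark is installed and can start a local context.
from operator import add
from pyspark import SparkContext

sc = SparkContext("local[1]", "word-count-sketch")
words = sc.parallelize(["the", "press", "press", "liberty"]).map(lambda w: (w, 1))
threshold = 1
word_counts = words. \
    reduceByKey(add). \
    filter(lambda word_count: word_count[1] > threshold). \
    collect()
print(word_counts)  # [('press', 2)] -- only words above the threshold survive
sc.stop()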
Example No. 3
def get_colocates_matches(document, start_word, end_word, window=0):
    """
    Get pages within a document that include colocates, one word
    followed by another word, with 0 or more intervening words.

    For each span of text, '<START_WORD> ... <END_WORD>', delimited by
    the colocates, a dictionary of the following form is included in
    the list returned:

        {
          "start_page": <PAGE_CODE>,
          "end_page": <PAGE_CODE>,
          "span": [<START_WORD>, ..., <END_WORD>]
        }

    :param document: document
    :type document: defoe.nls.document.Document
    :param start_word: start_word colocate
    :type start_word: str or unicode
    :param end_word: end_word colocate
    :type end_word: str or unicode
    :param window: maximum number of words allowed between the colocates
    :type window: int
    :return: list of dicts
    :rtype: list(dict)
    """
    start_page = None
    span = []
    span_length = 0
    matches = []
    window_plus_colocates = window + 2
    for page, word in document.scan_words():
        normalized_word = query_utils.normalize(word)
        if not normalized_word:
            continue
        if normalized_word == start_word:
            start_page = page
            span = []
            span_length = 0
        if start_page is not None:
            span.append(normalized_word)
            span_length += 1
            if span_length > window_plus_colocates:
                start_page = None
                span = []
                span_length = 0
                continue
            if normalized_word == end_word:
                matches.append({
                    "start_page": str(start_page.code),
                    "end_page": str(page.code),
                    "span": span
                })
                start_page = None
                span = []
                span_length = 0
    return matches
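
The windowed scan above depends on defoe's Document and query_utils, so it cannot run stand-alone, but the underlying state machine can. Below is a simplified, self-contained re-implementation operating on plain (page_code, word) pairs; the function name and sample data are invented and normalization is omitted.

# Simplified, self-contained sketch of the windowed collocate scan; it assumes
# already-normalized words and plain page codes instead of defoe objects.
def find_collocate_spans(page_words, start_word, end_word, window=0):
    matches, span, start_page = [], [], None
    limit = window + 2  # start word + up to 'window' intervening words + end word
    for page, word in page_words:
        if word == start_word:
            start_page, span = page, []
        if start_page is None:
            continue
        span.append(word)
        if len(span) > limit:
            start_page, span = None, []
        elif word == end_word:
            matches.append({"start_page": start_page,
                            "end_page": page,
                            "span": span})
            start_page, span = None, []
    return matches

pairs = [("p1", "free"), ("p1", "and"), ("p2", "fair"), ("p2", "trade")]
print(find_collocate_spans(pairs, "free", "fair", window=1))
# [{'start_page': 'p1', 'end_page': 'p2', 'span': ['free', 'and', 'fair']}]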
Example No. 4
def get_colocates_matches(article, start_word, end_word, window=0):
    """
    Get spans of text within an article that include colocates, one word
    followed by another word, with 0 or more intervening words.

    A list of lists of each span of text, '<START_WORD>
    ... <END_WORD>', delimited by the colocates, is returned.

    :param article: article
    :type article: defoe.papers.article.Article
    :param start_word: start_word colocate
    :type start_word: str or unicode
    :param end_word: end_word colocate
    :type end_word: str or unicode
    :param window: maximum number of words allowed between the colocates
    :type window: int
    :return: list of lists of words
    :rtype: list(list(str or unicode))
    """
    in_span = False
    span = []
    span_length = 0
    matches = []
    window_plus_colocates = window + 2
    for word in article.words:
        normalized_word = query_utils.normalize(word)
        if not normalized_word:
            continue
        if normalized_word == start_word:
            in_span = True
            span = []
            span_length = 0
        if in_span:
            span.append(normalized_word)
            span_length += 1
            if span_length > window_plus_colocates:
                in_span = False
                span = []
                span_length = 0
                continue
            if normalized_word == end_word:
                matches.append(span)
                in_span = False
                span = []
                span_length = 0
    return matches
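
A quick way to see the window semantics: window=0 admits only directly adjacent collocates, and a repeated start word restarts the span. The sketch below is a stripped-down stand-in for the article version (no pages, no normalization); data and names are invented.

# Stripped-down stand-in for the article scan, used only to show the window
# semantics; words are assumed to be normalized already.
def collocate_spans(words, start_word, end_word, window=0):
    matches, span, in_span = [], [], False
    limit = window + 2
    for word in words:
        if word == start_word:
            in_span, span = True, []
        if not in_span:
            continue
        span.append(word)
        if len(span) > limit:
            in_span, span = False, []
        elif word == end_word:
            matches.append(span)
            in_span, span = False, []
    return matches

print(collocate_spans(["free", "trade", "free", "fair"], "free", "fair", window=0))
# [['free', 'fair']] -- the span restarts at the second "free"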
Example No. 5
def article_idx_to_words_row(article_idx):
    """
    Given a tuple with an article and an index, return a Row with the
    index and a list of the words in the article.

    The words in the article are normalized, by removing all
    non-'a-z|A-Z' characters.

    Any stop words (words of two characters or fewer) are ignored.

    :param article_idx: tuple
    :type article_idx: tuple(defoe.papers.article.Article, int)
    :return: Row
    :rtype: pyspark.sql.Row
    """
    article, idx = article_idx
    words = []
    for word in article.words:
        normalized_word = query_utils.normalize(word)
        if len(word) > 2:   # Anything less is a stop word
            words.append(normalized_word)
    return Row(idx=idx, words=words)
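
A Row of this shape can be built locally without a defoe Article; the stand-in article, index and regex normalizer below are invented, and only pyspark.sql.Row is assumed.

# Local sketch: build the same kind of Row from an invented stand-in article.
import re
from collections import namedtuple
from pyspark.sql import Row

FakeArticle = namedtuple("FakeArticle", "words")

def normalize(word):
    # Stand-in for query_utils.normalize: keep only a-z/A-Z, lower-case.
    return re.sub("[^A-Za-z]", "", word).lower()

article, idx = FakeArticle(words=["The", "19th-century", "press", "on"]), 7
words = [normalize(word) for word in article.words if len(word) > 2]
print(Row(idx=idx, words=words))
# Row(idx=7, words=['the', 'thcentury', 'press'])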
Example No. 6
def do_query(archives, config_file=None, logger=None):
    """
    Gets colocated words and groups by year.

    config_file must be the path to a configuration file with the
    words to be searched for and the maximum number of intervening
    words (a "window"). This file must be a YAML document of form:

        start_word: <WORD>
        end_word: <WORD>
        window: <WINDOW>

    where <WINDOW> is greater than or equal to 0. If omitted then a
    default of 0 is assumed.

    Both colocated words and words in documents are normalized, by
    removing all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "document_id": <DOCUMENT_ID>,
              "place": <PLACE>,
              "publisher": <PUBLISHER>,
              "filename": <FILENAME>
              "matches":
              [
                {
                  "start_page": <PAGE_ID>,
                  "end_page": <PAGE_ID>,
                  "span": [<WORD>, ..., <WORD>]
                },
                ...
              ]
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by year
    :rtype: dict
    """
    if config_file is None or \
       not os.path.exists(config_file) or \
       not os.path.isfile(config_file):
        raise ValueError("config_file must be the path to a YAML "
                         "configuration file")
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    start_word = query_utils.normalize(config["start_word"])
    end_word = query_utils.normalize(config["end_word"])
    # The docstring promises a default window of 0 if "window" is omitted.
    window = config.get("window", 0)
    if window < 0:
        raise ValueError('window must be at least 0')

    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)])

    # [(document, matches), ...]
    colocated_words = documents.map(
        lambda document: (document,
                          get_colocates_matches(document, start_word,
                                                end_word, window)))
    # [(document, matches), ...]
    colocated_words = colocated_words.filter(
        lambda document_matches: len(document_matches[1]) > 0)

    # [(document, matches), ...]
    # =>
    # [(year, {"title": title, ...}), ...]
    matching_docs = colocated_words.map(
        lambda document_matches: (document_matches[0].year, {
            "title": document_matches[0].title,
            "place": document_matches[0].place,
            "publisher": document_matches[0].publisher,
            "document_id": document_matches[0].code,
            "filename": document_matches[0].archive.filename,
            "matches": document_matches[1]
        }))

    # [(year, {"title": title, ...}), ...]
    # =>
    # [(year, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda year_context:
             (year_context[0], list(year_context[1]))) \
        .collect()
    return result
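
The configuration this query expects can be written and parsed as below; the values are invented and yaml.safe_load is assumed to be an acceptable loader for this simple, trusted file.

# Sketch of the YAML configuration for a collocation query; values are invented.
import yaml

config = yaml.safe_load("""
start_word: stranger
end_word: danger
window: 2
""")
window = config.get("window", 0)  # default to 0 when the key is omitted
print(config["start_word"], config["end_word"], window)
# stranger danger 2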
Example No. 7
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "title": <TITLE>,
              "place": <PLACE>,
              "publisher": <PUBLISHER>,
              "page_number": <PAGE_NUMBER>,
              "content": <PAGE_CONTENT>,
              "word": <WORD>,
              "document_id": <DOCUMENT_ID>,
              "filename": <FILENAME>
             },
             ...
          ],
          <YEAR>:
          ...
        }

    :param archives: RDD of defoe.alto.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by year
    :rtype: dict
    """
    keywords = []
    with open(config_file, "r") as f:
        keywords = [query_utils.normalize(word) for word in list(f)]
    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)])

    # [(year, document, page, word), ...]
    filtered_words = documents.flatMap(
        lambda document: get_page_matches(document, keywords))

    # [(year, document, page, word), ...]
    # =>
    # [(year, {"title": title, ...}), ...]
    matching_docs = filtered_words.map(lambda year_document_page_word: (
        year_document_page_word[0], {
            "title": year_document_page_word[1].title,
            "place": year_document_page_word[1].place,
            "publisher": year_document_page_word[1].publisher,
            "page_number": year_document_page_word[2].code,
            "content": year_document_page_word[2].content,
            "word": year_document_page_word[3],
            "document_id": year_document_page_word[1].code,
            "filename": year_document_page_word[1].archive.filename
        }))

    # [(year, {"title": title, ...}), ...]
    # =>
    # [(year, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda year_context:
             (year_context[0], list(year_context[1]))) \
        .collect()
    return result
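
The final groupByKey step is what turns per-page matches into one list per year. A local PySpark sketch with invented records follows, assuming PySpark is installed.

# Local illustration of grouping per-match records by year; records are invented.
from pyspark import SparkContext

sc = SparkContext("local[1]", "concordance-sketch")
matching_docs = sc.parallelize([
    (1848, {"title": "A", "word": "liberty"}),
    (1848, {"title": "B", "word": "press"}),
    (1851, {"title": "C", "word": "liberty"}),
])
result = matching_docs \
    .groupByKey() \
    .map(lambda year_context: (year_context[0], list(year_context[1]))) \
    .collect()
print(dict(result))
# {1848: [... two records ...], 1851: [... one record ...]}
sc.stop()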
Example No. 8
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of articles containing two or more keywords and
    groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "words": [<WORD>, <WORD>, ...],
              "count": <COUNT>
            },
            ...
          ],
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: counts of articles containing two or more keywords, by year
    :rtype: dict
    """
    keywords = []
    with open(config_file, "r") as f:
        keywords = [query_utils.normalize(word) for word in list(f)]
    # [(year, article), ...]
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # [((year, [word, word, ...]), 1), ...]
    words = articles.map(
        lambda year_article: (
            (year_article[0],
             get_article_keywords(year_article[1], keywords,
                                  PreprocessWordType.NORMALIZE)),
            1))
    # [((year, [word, word, ...]), 1), ...]
    match_words = words.filter(
        lambda yearword_count: len(yearword_count[0][1]) > 1)
    # [((year, "word, word, ..."), 1), ...]
    # Convert word list to string so can serve as a key.
    multi_words = match_words.map(
        lambda yearword_count: ((yearword_count[0][0],
                                 ",".join(yearword_count[0][1])),
                                yearword_count[1]))
    # [((year, "word, word, ..."), 1), ...]
    # =>
    # [((year, "word, word, ..."), count), ...]
    # =>
    # [((year, ("word, word, ...", count)), ...]
    # =>
    # [((year, [{"words": [word, word, ...],
    #            "count": count}, ...],
    #          ...]
    # list of words is restored from string of words.
    result = multi_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1],
               yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0],
              word_article_count_list_to_dict(year_wordcount[1])))\
        .collect()
    return result
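
The comment above notes that the keyword list is joined into a string so it can serve as a reduce key; the sketch below shows that trick in plain Python with invented data.

# Plain-Python sketch of the join-to-string / split-back trick used above;
# the per-article keyword lists are invented.
from collections import Counter

per_article_keywords = [
    (1900, ["strike", "union"]),
    (1900, ["strike", "union"]),
    (1901, ["coal", "strike", "union"]),
]
counts = Counter((year, ",".join(words)) for year, words in per_article_keywords)
result = {}
for (year, joined), count in counts.items():
    result.setdefault(year, []).append({"words": joined.split(","), "count": count})
print(result)
# {1900: [{'words': ['strike', 'union'], 'count': 2}],
#  1901: [{'words': ['coal', 'strike', 'union'], 'count': 1}]}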
Example No. 9
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets the Latent Dirichlet Allocation (LDA) topics for words
    within articles.

    config_file must be the path to a LDA configuration file in YAML
    format. For example:

        keyword: <KEYWORD>
        optimizer: online|em
        max_iterations: <N>
        ntopics: <N>
        topic_words: <N>

    <N> must be >= 1 for each parameter.

    The keyword and words in documents are normalized, by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <0>: [<WORD_0>, ..., <WORD_topicwords>],
          <1>: [<WORD_0>, ..., <WORD_topicwords>],
          <2>: [<WORD_0>, ..., <WORD_topicwords>],
          ...
          <ntopics>: [<WORD_0>, ..., <WORD_topicwords>],
          years:[<MIN_YEAR>, <MAX_YEAR>]
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: LDA topics
    :rtype: dict
    """
    with open(config_file, 'r') as f:
        config = load(f)
        keyword = config['keyword']
        optimizer = config['optimizer']
        if optimizer != 'online' and optimizer != 'em':
            raise ValueError("optmizer must be 'online' or 'em' but is '{}'"
                             .format(optimizer))
        max_iterations = config['max_iterations']
        if max_iterations < 1:
            raise ValueError('max_iterations must be at least 1')
        ntopics = config['ntopics']
        if ntopics < 1:
            raise ValueError('ntopics must be at least 1')
        topic_words = config['topic_words']
        if topic_words < 1:
            raise ValueError('topic_words must be at least 1')

    keyword = query_utils.normalize(keyword)

    # [date, ...]
    # =>
    # [(year, year), ...]
    # =>
    # (year, year)
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(min_max_tuples)

    # [issue, issue, ...]
    # =>
    # [article, article, ...]
    # =>
    # [(article, 0), (article, 1), ...]
    # =>
    # [Row, Row, ...]
    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(lambda article:
                article_contains_word(article,
                                      keyword,
                                      PreprocessWordType.NORMALIZE)) \
        .zipWithIndex() \
        .map(article_idx_to_words_row)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()

    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)

    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)

    vocabulary = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into N topics using LDA.
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    topics_final = [topic_render(topic, topic_words, vocabulary)
                    for topic in lda_model.describeTopics(maxTermsPerTopic=topic_words)]

    topics = [('years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        term_words = []
        for term in topic:
            term_words.append(term)
        topics.append((str(i), term_words))
    return topics
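
The StopWordsRemover / CountVectorizer stage that feeds LDA.train above can be exercised locally on a toy DataFrame. The sketch below assumes PySpark is installed; the token lists are invented.

# Local sketch of the feature-extraction stage that precedes LDA.train;
# the toy token lists are invented.
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, CountVectorizer

spark = SparkSession.builder.appName('lda-sketch').getOrCreate()
articles_df = spark.createDataFrame(
    [(0, ["the", "highland", "emigration"]),
     (1, ["emigration", "to", "canada"])],
    ["idx", "words"])

remover = StopWordsRemover(inputCol='words', outputCol='filtered')
articles_df = remover.transform(articles_df)   # drops "the", "to", ...

model = CountVectorizer(inputCol='filtered', outputCol='vectors').fit(articles_df)
print(model.vocabulary)                        # e.g. ['emigration', 'highland', 'canada']
articles_df = model.transform(articles_df)
articles_df.select('idx', 'vectors').show(truncate=False)
spark.stop()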
Example No. 10
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by date.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <DATE>:
          [
            {
              "title": <TITLE>,
              "page_ids": <PAGE_IDS>,
              "content": <PAGE_CONTENT>,
              "word": <WORD>,
              "article_id": <ARTICLE_ID>,
              "issue_id": <ISSUE_ID>,
              "filename": <FILENAME>
            },
            ...
          ],
          <DATE>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    keywords = []
    with open(config_file, "r") as f:
        keywords = [query_utils.normalize(word) for word in list(f)]

    # [(date, issue, article, word), ...]
    filtered_words = issues.flatMap(lambda issue: get_article_matches(
        issue, keywords, PreprocessWordType.NORMALIZE))

    # [(date, issue, article, word), ...]
    # =>
    # [(date, {"title": title, ...}), ...]
    matching_docs = filtered_words.map(lambda date_issue_article_word: (
        date_issue_article_word[0], {
            "title": date_issue_article_word[2].title_string,
            "page_ids": list(date_issue_article_word[2].page_ids),
            "content": date_issue_article_word[2].words_string,
            "word": date_issue_article_word[3],
            "article_id": date_issue_article_word[2].article_id,
            "issue_id": date_issue_article_word[1].newspaper_id,
            "filename": date_issue_article_word[1].filename
        }))

    # [(date, {"title": title, ...}), ...]
    # =>
    # [(date, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
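
Reading the one-keyword-per-line configuration file works as sketched below; the file contents are invented and the regex is a stand-in for query_utils.normalize.

# Sketch of reading a one-keyword-per-line file; io.StringIO stands in for the
# real file and the regex stands in for query_utils.normalize.
import io
import re

def normalize(word):
    return re.sub("[^A-Za-z]", "", word).lower()

config = io.StringIO("Liberty\npress\nRailways\n")
keywords = [normalize(word) for word in list(config)]
print(keywords)  # ['liberty', 'press', 'railways']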
Example No. 11
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets colocated words and groups by year.

    config_file must be the path to a configuration file with the
    words to be searched for and the maximum number of intervening
    words (a "window"). This file must be a YAML document of form:

        start_word: <WORD>
        end_word: <WORD>
        window: <WINDOW>

    where <WINDOW> is greater than or equal to 0. If omitted then a
    default of 0 is assumed.

    Both colocated words and words in articles are normalized, by
    removing all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "article_id": <ARTICLE_ID>,
              "issue_id": <ISSUE_ID>,
              "page_ids": <PAGE_IDS>,
              "filename": <FILENAME>,
              "matches":
              [
                [<WORD>, ..., <WORD>],
                ...
              ]
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on articles in which keywords occur grouped
    by year
    :rtype: dict
    """
    if config_file is None or \
       not os.path.exists(config_file) or \
       not os.path.isfile(config_file):
        raise ValueError("config_file must be the path to a YAML "
                         "configuration file")
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    start_word = query_utils.normalize(config["start_word"])
    end_word = query_utils.normalize(config["end_word"])
    # The docstring promises a default window of 0 if "window" is omitted.
    window = config.get("window", 0)
    if window < 0:
        raise ValueError('window must be at least 0')

    # [(issue, article), ...]
    issue_articles = issues.flatMap(
        lambda issue: [(issue, article) for article in issue.articles])

    # [(issue, article, matches), ...]
    colocated_words = issue_articles.map(lambda issue_article: (
        issue_article[0], issue_article[1],
        get_colocates_matches(issue_article[1], start_word, end_word, window)))
    # [(issue, article, matches), ...]
    colocated_words = colocated_words.filter(
        lambda issue_article_matches: len(issue_article_matches[2]) > 0)

    # [(issue, article, matches), ...]
    # =>
    # [(year, {"title": title, ...}), ...]
    matching_articles = colocated_words.map(lambda issue_article_matches: (
        issue_article_matches[0].date.year, {
            "title": issue_article_matches[1].title_string,
            "article_id": issue_article_matches[1].article_id,
            "page_ids": list(issue_article_matches[1].page_ids),
            "issue_id": issue_article_matches[0].newspaper_id,
            "filename": issue_article_matches[0].filename,
            "matches": issue_article_matches[2]
        }))

    # [(year, {"title": title, ...}), ...]
    # =>
    # [(year, [{"title": title, ...], {...}), ...)]
    result = matching_articles \
        .groupByKey() \
        .map(lambda year_context:
             (year_context[0], list(year_context[1]))) \
        .collect()
    return result
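
The filter-then-group stage at the end can be pictured without Spark: articles with no matches are dropped and the rest are bucketed by issue year. The records below are invented.

# Spark-free sketch of the filter-and-group stage; all records are invented.
records = [
    (1850, {"article_id": "a1", "matches": [["free", "trade"]]}),
    (1850, {"article_id": "a2", "matches": []}),
    (1853, {"article_id": "a3", "matches": [["free", "and", "fair", "trade"]]}),
]
grouped = {}
for year, record in records:
    if record["matches"]:          # mirrors the len(...) > 0 filter above
        grouped.setdefault(year, []).append(record)
print(grouped)
# {1850: [{'article_id': 'a1', 'matches': [['free', 'trade']]}],
#  1853: [{'article_id': 'a3', 'matches': [['free', 'and', 'fair', 'trade']]}]}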