Example #1
def do_query(hansards, config_file=None, logger=None, context=None):

    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    # [(hansard, heading), ...]
    headings = hansards.flatMap(
        lambda hansard: [(hansard, h) for h in get_headings(hansard)])

    # [(hansard, heading, clean_heading_text), ...]
    discussion_text = headings.map(
        lambda hansard_discussion: (hansard_discussion[0], hansard_discussion[1],
                                    clean_text(hansard_discussion[1], preprocess_type)))

    # [(hansard, heading, clean_heading_text), ...]
    filter_discussions = discussion_text.filter(
        lambda disc: any(k in disc[2] for k in keywords))

    # [(hansard, heading, matching_keywords), ...]
    matching_discussions = filter_discussions.map(
        lambda disc: (disc[0], disc[1], find_matches(disc[2], keywords)))

    matching_data = matching_discussions.flatMap(
        lambda discussion: [(discussion[1]._id, {
            "title": discussion[1].title,
            "heading_id": discussion[1]._id,
            "speech_id": speech._id,
            "speaker": ((speech.speaker.id, speech.speaker.name)
                        if speech.speaker is not None else ''),
            "text": speech.text,
            "filename": discussion[0].filename,
            "term": list(discussion[2]),
        }) for speech in discussion[1].speeches])

    result = matching_data \
        .groupByKey() \
        .map(lambda speech:
             (speech[0], list(speech[1]))) \
        .collect()
    return result
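Example #1 reads its keywords directly from the keywords list in the YAML configuration and preprocesses each phrase word by word. A minimal sketch of the kind of configuration it expects (the exact set of keys beyond "keywords" and "preprocess" is an assumption):

import yaml

sample_config = """
preprocess: normalize
keywords:
  - moral philosophy
  - natural history
"""
config = yaml.load(sample_config, Loader=yaml.FullLoader)
print(config["keywords"])  # ['moral philosophy', 'natural history']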
Example #2
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance and collocation analysis for keywords, given a target word,
    and groups the results by date. The window variable can be used to specify
    the number of words to take to the right and left of each match.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.
    Keywords and words in documents are preprocessed using the configured
    preprocessing option.

    Returns result of form:
        <YEAR>:
        - [<WORD>, <CONCORDANCE>]
        - [<WORD>, <CONCORDANCE>]

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)])

    # [(year, sentences), ...]
    raw_sentences = documents.flatMap(
        lambda document: extract_sentences(document))

    # [(year, [preprocessed_sentence, ...]), ...]
    preprocessed_sentences = raw_sentences.flatMap(
        lambda raw_sentence: [total_preprocessed(raw_sentence)])

    print("Preprocessed  %s" % preprocessed_sentences.take(3))

    # [(year, [[preprocessed_sentence, ...], ...]), ...]
    result = preprocessed_sentences \
        .groupByKey() \
        .map(lambda year_sentences:
             (year_sentences[0], list(year_sentences[1]))) \
        .collect()

    return result
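Nearly every query in this listing ends with the same grouping idiom: groupByKey, materialise each group as a list, then collect to the driver. A self-contained sketch of that pattern on toy data (local SparkContext, no defoe objects):

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([(1810, "page one"), (1810, "page two"), (1811, "page three")])
grouped = pairs.groupByKey() \
    .map(lambda year_pages: (year_pages[0], list(year_pages[1]))) \
    .collect()
print(grouped)  # e.g. [(1810, ['page one', 'page two']), (1811, ['page three'])]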
Example #3
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Writes pages (preprocessed or not) as strings to HDFS text files, along with some metadata associated with each document.
    If a config_file indicating the preprocessing treatment is given, it is applied to the words extracted from the pages. Otherwise, no preprocessing is applied.
    Metadata collected: title, edition, year, place, archive filename, page filename, page id, num pages, type of archive, model, type of preprocessing treatment, prep_page_string

    Data is saved as an RDD into HDFS text files.

    Example:
    ('Encyclopaedia Britannica; or, A dictionary of arts, sciences, and miscellaneous literature', 'Fourth edition ...', 
      1810, 'Edinburgh', '/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/191253839', 
      'alto/192209952.34.xml', 'Page5', 446, 'book', 'nls', <PreprocessWordType.NONE:4>, u"Part III. MORAL PHILOSOPHY.....)
    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    if config_file is not None:
        with open(config_file, "r") as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        preprocess_type = query_utils.extract_preprocess_word_type(config)
    else:
        preprocess_type = query_utils.parse_preprocess_word_type("none")
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, str(document.year), \
                          document.place, document.archive.filename, str(document.num_pages), \
                           document.document_type, document.model, document) for document in list(archive)])
    # [(title, edition, year, place, archive filename, page filename,
    #   page id, num pages, type of archive, model, type of preprocess treatment, page_as_string)]
    pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                               year_document[3], year_document[4], page.code, page.page_id, \
                               year_document[5], year_document[6], year_document[7], str(preprocess_type), \
                               get_page_as_string(page, preprocess_type)) for page in year_document[8]])

    pages.saveAsTextFile("hdfs:///user/at003/rosa/demo_text4.txt")
    return "0"
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords using a window of words (here configured to 40) and groups the results by date.
    Stores the snippet (40 words before and after each term).

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system.

    Returns result of form:
        {
          <YEAR>:
          [
            [- archive_filename: 
             - edition:
             - filename:
             - snippet:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }
  


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 40
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(lambda archive: [
        (document.year, document, document.title, document.edition,
         document.archive.filename) for document in list(archive)])
    # [(year, title, edition, archive_filename, page_code, clean_page_string), ...]
    clean_pages = documents.flatMap(lambda year_document: [
        (year_document[0], year_document[2], year_document[3], year_document[4],
         page.code, clean_page_as_string(page, defoe_path, os_type))
        for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(lambda cl_page: [(
        cl_page[0], cl_page[1], cl_page[2], cl_page[3], cl_page[4],
        preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
        year_page[5], get_text_keysentence_idx(year_page[5], keysentences)))

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
             {"title": year_idx[1], "edition": year_idx[2],
              "archive_filename": year_idx[3], "filename": year_idx[4],
              "term": word_idx[0],
              "snippet": get_concordance_string(year_idx[5], word_idx[0],
                                                word_idx[1], window)})
            for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
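get_concordance_string comes from defoe and is not shown here; purely as a rough, hypothetical illustration of the idea (not defoe's implementation), a snippet of `window` words either side of a match index can be cut like this:

def snippet_around(text, match_idx, window):
    """Return up to `window` words either side of the word at match_idx."""
    words = text.split()
    start = max(0, match_idx - window)
    return " ".join(words[start:match_idx + window + 1])

print(snippet_around("the quick brown fox jumps over the lazy dog", 4, 2))
# brown fox jumps over the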
Example #5
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by word.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <WORD>:
          [
            { "article_id": <ARTICLE ID>,
              "issue_filename": <ISSUE.ZIP>, 
              "coord": <COORDENATES>, 
              "page_area": <PAGE AREA>,
              "page_filename": < PAGE FILENAME>,
              "place": <PLACE>,
              "textblock_id": <TEXTBLOCK ID>,
              "title": <TITLER>,
              "words": <WORDS>
              "year": <YEAR>,
            },
            ...
          ],
          <WORD>:
          ...
        }

    :param archives: RDD of defoe.fmp.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)])

    filtered_words = documents.flatMap(lambda document: get_article_matches(
        document, keywords, preprocess_type))
    #[(year, document, article, textblock_id, textblock_coords, textblock_page_area, words, page_name, keyword), ....]
    # =>
    # [(word, {"article_id": article_id, ...}), ...]
    matching_docs = filtered_words.map(lambda document_article_word: (
        document_article_word[8], {
            "title": document_article_word[1].title,
            "place": document_article_word[1].place,
            "article_id": document_article_word[2],
            "textblock_id": document_article_word[3],
            "coord": document_article_word[4],
            "page_area": document_article_word[5],
            "year": document_article_word[0],
            "words": document_article_word[6],
            "page_filename": document_article_word[7],
            "issue_filename": document_article_word[1].archive.filename
        }))

    # [(word, {"article_id": article_id, ...}), ...]
    # =>
    # [(word, [{"article_id": article_id, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda word_context:
             (word_context[0], list(word_context[1]))) \
        .collect()
    return result
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences,
    filtering by date. Results are grouped by year.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system. We can also configure how many target words
    to use, and at which position the lexicon words start.

    The number of target words to take from the lexicon file is indicated in the
    configuration file as, for example, num_target: 1. That means that only one
    word/sentence (the first one) is used as the target.

    To include the target words in the lexicon, set lexicon_start: 0 in the
    configuration file.

    To exclude the target words from the lexicon (say we have just one target word),
    set lexicon_start: 1.

    Finally, the dates to use in this query are indicated in the configuration
    file as follows:

      start_year: YEAR_START (inclusive)
      end_year: YEAR_FINISH (inclusive)

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    start_year = int(config["start_year"])
    end_year = int(config["end_year"])
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]

    # [(year, clean_article_string), ...]
    clean_articles = issues.flatMap(lambda issue: [
        (issue.date.year, clean_article_as_string(article, defoe_path, os_type))
        for article in issue.articles
        if start_year <= int(issue.date.year) <= end_year])

    # [(year, preprocessed_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [
        (cl_article[0], preprocess_clean_article(cl_article[1], preprocess_type))])

    # [(year, preprocessed_article_string), ...]
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[1] for target_s in target_sentences))

    # [(year, preprocessed_article_string), ...]
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], get_articles_list_matches(year_article[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [
        ((year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
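The num_target and lexicon_start options simply slice the normalised lexicon, as the two assignments above show. A tiny worked example with made-up key sentences:

# values below are purely illustrative
keysentences = ["machinery", "steam engine", "spinning jenny"]
num_target, lexicon_start = 1, 1

target_sentences = keysentences[0:num_target]   # ['machinery']
keysentences = keysentences[lexicon_start:]     # ['steam engine', 'spinning jenny']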
Example #7
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Crops articles' images for keywords and groups by word.

    config_file must be a YAML file that has the following values:
        * preprocess: treatment to use for preprocessing the words. Options: [normalize|stem|lemmatize|none]
        * data: TXT file with a list of the keywords to search for, one per line.
                This should be in the same path as the configuration file.

        Important!!: the first two words in this list are treated as target words.

        * years_filter: min and max years to filter the data, separated by "-"
        * output_path: the path in which to store the cropped images.

    Returns result of form:

        {
          <WORD>:
          [
            { "article_id": <ARTICLE ID>,
              "issue_filename": <ISSUE.ZIP>, 
              "issue_id": <ISSUE ID>
              "coord": <COORDENATES>,
              "cropped_image": <IMAGE.JPG> 
              "page_area": <PAGE AREA>,
              "page_filename": < PAGE FILENAME>,
              "place": <PLACE>,
              "textblock_id": <TEXTBLOCK ID>,
              "title": <TITLER>,
              "words": <WORDS>,
              "preprocessed_words": <PREPROCESSED WORDS> 
              "year": <YEAR>,
              "date": <DATE>
            },
            ...
          ],
          <WORD>:
          ...
        }
    :param archives: RDD of defoe.fmp.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    year_min, year_max = query_utils.extract_years_filter(config)
    output_path = query_utils.extract_output_path(config)
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [query_utils.preprocess_word(word, preprocess_type)
                    for word in list(f)]

    
    # We can change the following line if we want to include more or fewer words as target_words.
    # In this case, the first two words of the lexicon are selected as target_words.
    target_words = keywords[0:2]
    # The rest of the words in the lexicon are selected as keywords.
    keywords = keywords[2:]

    # We select/filter the textblocks that follow this rule: the text contains
    # at least one target word AND one keyword.

    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)
                         if int(year_min) <= document.year <= int(year_max)])

    filtered_tb = documents.flatMap(
        lambda document: get_article_matches(document, target_words, preprocess_type))
    
    filtered_words = filtered_tb.flatMap(lambda tb: get_tb_matches(tb, keywords))

    #[(year, document, article, textblock_id, textblock_coords, textblock_page_area, words, preprocessed_words, page_name, keyword,target), ....]
    # [(word, {"article_id": article_id, ...}), ...]
    matching_docs = filtered_words.map(
        lambda document_article_word:
        (document_article_word[9],
         {"title": document_article_word[1].title,
          "place": document_article_word[1].place,
          "article_id": document_article_word[2],
          "textblock_id": document_article_word[3], 
          "coord": document_article_word[4],
          "page_area": document_article_word[5],
          "year": document_article_word[0],
          "words":  document_article_word[6],
          "date":  document_article_word[1].date,
          "preprocessed_words":  document_article_word[7],
          "page_filename":  document_article_word[8],
          "issue_id": document_article_word[1].documentId,
          "issue_dirname": document_article_word[1].archive.filename,
          "target_word": document_article_word[10],
          "cropped_image": segment_image(document_article_word[4], document_article_word[8], document_article_word[1].archive.filename, document_article_word[9], output_path, document_article_word[10])
         }))


    # [(word, {"article_id": article_id, ...}), ...]
    # =>
    # [(word, [{"article_id": article_id, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda word_context:
             (word_context[0], list(word_context[1]))) \
        .collect()
    return result
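Based on the docstring above, a configuration for this query would look roughly as follows; the years_filter parsing shown here is only a guess at what query_utils.extract_years_filter does with the "-"-separated value:

import yaml

sample_config = yaml.load("""
preprocess: normalize
data: keywords.txt
years_filter: 1800-1850
output_path: ./cropped_images
""", Loader=yaml.FullLoader)

year_min, year_max = sample_config["years_filter"].split("-")
print(year_min, year_max)  # 1800 1850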
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the articles' text along with metadata by using a list of
    keywords or keysentences, and groups the results by year. In the result
    we store the preprocessed text instead of the original text.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system. We can also configure how many target words
    to use, and at which position the lexicon words start.

    The number of target words to take from the lexicon file is indicated in the
    configuration file as, for example, num_target: 1. That means that only one
    word/sentence (the first one) is used as the target.

    To include the target words in the lexicon, set lexicon_start: 0 in the
    configuration file.

    To exclude the target words from the lexicon (say we have just one target word),
    set lexicon_start: 1.

    Returns result of form:

        {
          <YEAR>:
          [
            [- article_id: 
             - authors:
             - filename:
             - issue_id:
             - page_ids:
             - preprocessed_text:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }
        


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: articles' text and metadata for articles in which keywords occur,
    grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]

    # [(year, issue, article, clean_article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, issue, article, preprocessed_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[2],
                             preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, issue, article, preprocessed_article_string), ...]
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[3] for target_s in target_sentences))

    # [(year, issue, article, preprocessed_article_string), ...]
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[3] for keysentence in keysentences))

    # [(year, issue, article, preprocessed_article_string, [keysentence, ...]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count

    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[3],
        get_articles_list_matches(year_article[3], keysentences)))

    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3], sentence)\
                                for sentence in year_sentence[4]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id:": sentence_data[2].article_id,
            "authors:": sentence_data[2].authors_string,
            "page_ids": list(sentence_data[2].page_ids),
            "term": sentence_data[4],
            "preprocessed text": sentence_data[3],
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by date.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <DATE>:
          [
            {
              "title": <TITLE>,
              "page_ids": <PAGE_IDS>,
              "content": <PAGE_CONTENT>,
              "word": <WORD>,
              "article_id": <ARTICLE_ID>,
              "issue_id": <ISSUE_ID>,
              "filename": <FILENAME>
            },
            ...
          ],
          <DATE>:
          ...
        }

    :param issues: RDD of defoe.alto.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    
    # [(date, issue, article, word), ...]
    filtered_words = issues.flatMap(
        lambda issue: get_article_matches(issue,
                                          keysentences,
                                          PreprocessWordType.NORMALIZE))

    # [(date, issue, article, word, article_clean), ...]
    # =>
    # [(date, {"title": title, ...}), ...]
    matching_docs = filtered_words.map(
        lambda date_issue_article_word:
        (date_issue_article_word[0],
         {"title": date_issue_article_word[2].title_string,
          "page_ids": list(date_issue_article_word[2].page_ids),
          "content": date_issue_article_word[4],
          "word": date_issue_article_word[3],
          "article_id": date_issue_article_word[2].article_id,
          "issue_id": date_issue_article_word[1].newspaper_id,
          "filename": date_issue_article_word[1].filename}))

    # [(date, {"title": title, ...}), ...]
    # =>
    # [(date, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Example #10
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts a "hit" every time it finds a term from our lexicon
    and groups the results by book.

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system.

    Returns result of form, e.g.:

         -  '''Twas on the morn of sweet May Day':
                - - neu
                     - 1
                - - blaw
                     - 5
     That means that "neu" appears once in the book 'Twas on the morn of sweet May Day',
     and "blaw" appears 5 times in the same book.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by title
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(title, document), ...]
    documents = archives.flatMap(lambda archive: [
        (document.title, document) for document in list(archive)])

    # [(title, clean_page_string), ...]
    clean_pages = documents.flatMap(lambda title_document: [
        (title_document[0], clean_page_as_string(page, defoe_path, os_type))
        for page in title_document[1]])

    # [(title, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(lambda cl_page: [
        (cl_page[0], preprocess_clean_page(cl_page[1], preprocess_type))])

    # [(title, preprocessed_page_string), ...]
    filter_pages = pages.filter(lambda title_page: any(
        keysentence in title_page[1] for keysentence in keysentences))

    # [(title, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(lambda title_page: (
        title_page[0], get_sentences_list_matches_per_page(title_page[1], keysentences)))

    # [((title, keysentence), 1), ((title, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda title_sentence: [
        ((title_sentence[0], sentence), 1) for sentence in title_sentence[1]])

    # [((title, keysentence), num_keysentences), ...]
    # =>
    # [(title, (keysentence, num_keysentences)), ...]
    # =>
    # [(title, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda titlesentence_count:
             (titlesentence_count[0][0],
              (titlesentence_count[0][1], titlesentence_count[1]))) \
        .groupByKey() \
        .map(lambda title_sentencecount:
             (title_sentencecount[0], list(title_sentencecount[1]))) \
        .collect()
    return result
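The counting step used here (and in several queries above) is reduceByKey(add) over ((key, sentence), 1) pairs. A self-contained sketch of that step with toy data:

from operator import add
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
hits = sc.parallelize([(("Book A", "corn law"), 1),
                       (("Book A", "corn law"), 1),
                       (("Book B", "free trade"), 1)])
counts = hits.reduceByKey(add).collect()
print(counts)  # e.g. [(('Book A', 'corn law'), 2), (('Book B', 'free trade'), 1)]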
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the articles' text along with metadata by using a list of
    keywords or keysentences, and groups the results by year.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system. We can also configure how many target words
    to use, and at which position the lexicon words start.

    Returns result of form:

        {
          <YEAR>:
          [
            [- article_id: 
             - authors:
             - filename:
             - issue_id:
             - page_ids:
             - text:
             - term
             - title ]
            ...
          ],
          <YEAR>:
          ...
        }


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: articles' text and metadata for articles in which keywords occur,
    grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    os_type = "sys-i386-64"
    if sys.platform == 'linux':
        os_type = "sys-i386-64"
    elif sys.platform == 'darwin':
        os_type = "sys-i386-snow-leopard"

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, issue, article, preprocessed_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[2],
                             preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, issue, article, preprocessed_article_string), ...]
    filter_articles = t_articles.filter(
        lambda year_article: any(k in year_article[3] for k in keywords))

    # [(year, issue, article, [keyword, ...]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count

    matching_articles = filter_articles.map(
        lambda year_article: (year_article[0], year_article[1], year_article[2],
                              get_articles_list_matches(year_article[3], keywords)))

    #   matching_sentences = matching_articles.flatMap(
    #       lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], sentence)\
    #                               for sentence in year_sentence[3]])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id:": sentence_data[2].article_id,
            "authors:": sentence_data[2].authors_string,
            "page_ids": list(sentence_data[2].page_ids),
            "section": sentence_data[2].ct,
            "term": sentence_data[3],
            "original text": sentence_data[2].words_string,
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Example #12
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the articles' text along with metadata by using a list of
    keywords or keysentences, and groups the results by year.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the defoe
    path and the type of operating system. This variant additionally reads a list of
    control words and a distance (in words) within which keywords and control words
    must co-occur (see the keywords, controlwords and distance configuration entries).


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: articles in which keywords and control words co-occur,
    grouped by year
    :rtype: dict
    """

    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    distance = int(config['distance'])

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    unproc_controlwords = config['controlwords']
    controlwords = []
    for c in unproc_controlwords:
        controlwords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in c.split()
        ]))
    print(f'controlwords: {controlwords}')

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, issue, article, preprocessed_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[2],
                             preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, issue, article, preprocessed_article_string), ...]
    filter_articles = t_articles.filter(
        lambda year_article: any(k in year_article[3] for k in keywords))

    matching_articles = filter_articles.flatMap(
        lambda year_article: [(year_article[0], year_article[1], year_article[2], w)
                              for w in within_distance(keywords, controlwords,
                                                       year_article[3], distance)])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "article_id": sentence_data[2].article_id,
            "page_ids": list(sentence_data[2].page_ids),
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename,
            "original text": sentence_data[2].words_string,
            "keywords": list(sentence_data[3][1]),
            "controls": list(sentence_data[3][0]),
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
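within_distance is a defoe helper whose implementation is not shown here; judging from how its result is used above, it yields (controls, keywords) groups that co-occur within `distance` words. Purely as a hypothetical illustration of that kind of check (not defoe's implementation):

def cooccur_within(text, word_a, word_b, distance):
    """True if word_a and word_b occur within `distance` words of each other."""
    words = text.split()
    positions_a = [i for i, w in enumerate(words) if w == word_a]
    positions_b = [i for i, w in enumerate(words) if w == word_b]
    return any(abs(i - j) <= distance for i in positions_a for j in positions_b)

print(cooccur_within("cheap bread and corn law repeal", "bread", "corn", 3))  # True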
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words, for keywords and groups by date.

    Data in ES has the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must indicate the list of words/sentences (lexicon) to search for,
    and the preprocessing treatment to select from ES.


    Returns result of form:
          [(year, [(title, edition, archive_filename, filename, word, concordance),
              (title, edition, archive_filename, filename, word, concordance), ...]), ...]


    :param df: DataFrame of pages with the columns listed above
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    window = 10
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    
    # Filter out pages that are null, keep only rows whose model is "nls", and select
    # the columns: year, title, edition, archive_filename, source_text_filename and
    # the page as a string (either raw or preprocessed).
    if preprocess_config == "normalize":
        fdf = df.withColumn("source_text_norm", blank_as_null("source_text_norm"))
        newdf=fdf.filter(fdf.source_text_norm.isNotNull()).filter(fdf["model"]=="nls").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.source_text_norm)
    elif preprocess_config == "lemmatize":
        fdf = df.withColumn("source_text_lemmatize", blank_as_null("source_text_lemmatize"))
        newdf=fdf.filter(fdf.source_text_lemmatize.isNotNull()).filter(fdf["model"]=="nls").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.source_text_lemmatize)
    elif preprocess_config == "stem":
        fdf = df.withColumn("source_text_stem", blank_as_null("source_text_stem"))
        newdf=fdf.filter(fdf.source_text_stem.isNotNull()).filter(fdf["model"]=="nls").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.source_text_stem)
    else: 
        fdf = df.withColumn("source_text_clean", blank_as_null("source_text_clean"))
        newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.source_text_clean)


    pages = newdf.rdd.map(tuple)
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    

    filter_pages = pages.filter(
        lambda year_page: any( keysentence in year_page[5] for keysentence in keysentences))


    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(
        lambda year_page: (
            (year_page[0],
             year_page[1],
             year_page[2],
             year_page[3],
             year_page[4],
             year_page[5],
             get_text_keyword_idx(year_page[5],
                                     keysentences))
        )
    )

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
             (year_idx[1], year_idx[2], year_idx[3], year_idx[4], word_idx[0],\
              get_concordance(year_idx[5], word_idx[0], word_idx[1], window)))\
              for word_idx in year_idx[6]])


    # [(year, [(title, edition, archive_filename, filename, word, concordance),
    #          (title, edition, archive_filename, filename, word, concordance), ...]), ...]
    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
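blank_as_null is defined elsewhere in defoe; the usual PySpark idiom behind a helper with this name turns empty strings into nulls so that the isNotNull() filters above drop them. A sketch of that idiom (assumed, not necessarily defoe's exact code):

from pyspark.sql.functions import col, when

def blank_as_null(column_name):
    """Return the column with empty strings replaced by null."""
    return when(col(column_name) != "", col(column_name)).otherwise(None)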
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by date.

    Data in ES has the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must indicate the list of words/sentences (lexicon) to search for,
    and the preprocessing treatment to select from ES.


    Returns result of form:

        {
          <YEAR>:
          [
            {"title": 
            "edition": 
            "archive_name": 
            "filename": 
            "text": 
            "keysentence": 
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param df: DataFrame of pages with the columns listed above
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    # Filter out pages that are null, keep only rows whose model is "nls", and select
    # the columns: year, title, edition, archive_filename, source_text_filename and
    # the page as a string (either raw or preprocessed).
    if preprocess_config == "normalize":
        fdf = df.withColumn("source_text_norm",
                            blank_as_null("source_text_norm"))
        newdf = fdf.filter(fdf.source_text_norm.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_norm)
    elif preprocess_config == "lemmatize":
        fdf = df.withColumn("source_text_lemmatize",
                            blank_as_null("source_text_lemmatize"))
        newdf = fdf.filter(fdf.source_text_lemmatize.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_lemmatize)
    elif preprocess_config == "stem":
        fdf = df.withColumn("source_text_stem",
                            blank_as_null("source_text_stem"))
        newdf = fdf.filter(fdf.source_text_stem.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_stem)
    else:
        fdf = df.withColumn("source_text_clean",
                            blank_as_null("source_text_clean"))
        newdf = fdf.filter(fdf.source_text_clean.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_clean)

    pages = newdf.rdd.map(tuple)
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive, filename, text, keysentence), ...]
    # We also need to convert the string as an integer spliting first the '.
    matching_pages = filter_pages.map(lambda year_page: (
        year_page[0],
        get_pages_matches_no_prep(year_page[1], year_page[2], year_page[3],
                                  year_page[4], year_page[5], keysentences)))

    # [(year, [title, edition, archive, filename, text, keysentence]), ...]

    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], data_sentence)
                               for data_sentence in year_sentence[1]])

    matching_docs = matching_sentences.map(lambda date_page:
                                           (date_page[0], {
                                               "title": date_page[1][0],
                                               "edition": date_page[1][1],
                                               "archive_name": date_page[1][2],
                                               "filename": date_page[1][3],
                                               "text": date_page[1][4],
                                               "keysentence": date_page[1][5]
                                           }))

    # [(date, {"title": title, ...}), ...]
    # =>
    # [(date, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Example #15
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of articles in which there are occurrences of
    keysentences and groups them by year.

    Words in articles and keysentences can be normalized, normalized
    and stemmed, or normalized and lemmatized (default).

    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of the keysentences to search for, one per line. If <DATA_FILE> is
    a relative path then it is assumed to be relative to the directory
    in which config_file resides.

    Returns result of form:

        {
            <YEAR>:
            [
                [<SENTENCE>, <NUM_ARTICLES>],
                [<SENTENCE>, <NUM_ARTICLES>],
                ...
            ],
            <YEAR>:
            ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keysentences grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
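            # Join the preprocessed words back into a single space-separated
            # key sentence (this loop is equivalent to " ".join(sentence_word)):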
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string)
    articles = issues.flatMap(lambda issue: [(
        issue.date.year, get_article_as_string(article, preprocess_type))
                                             for article in issue.articles])

    # [(year, article_string)
    filter_articles = articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
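# Minimal, Spark-free sketch (illustrative only; names are made up) of the shape
# change performed by the reduceByKey(add) / groupByKey pipeline used above:
#   [((year, keysentence), 1), ...]  ->  {year: [(keysentence, count), ...]}
from collections import Counter, defaultdict

def _group_counts_by_year(matches):
    counts = Counter()
    for (year, sentence), one in matches:
        counts[(year, sentence)] += one          # reduceByKey(add)
    grouped = defaultdict(list)
    for (year, sentence), count in counts.items():
        grouped[year].append((sentence, count))  # re-key by year, then groupByKey
    return dict(grouped)

# _group_counts_by_year([((1795, "corn laws"), 1), ((1795, "corn laws"), 1)])
# -> {1795: [('corn laws', 2)]}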
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of articles containing both a target word and one or
    more keywords and groups by year.

    Words in articles, target words and keywords can be normalized,
    normalized and stemmed, or normalized and lemmatized (default).

    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of keywords to search for, one per line. The first word is assumed
    to be the target word. If <DATA_FILE> is a relative path then it
    is assumed to be relative to the directory in which config_file
    resides.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "target_word": <WORD>,
              "words": [<WORD>, <WORD>, ...],
              "count": <COUNT>
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    target_word = keywords[0]
    keywords = keywords[1:]

    # [(year, article), ...]
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # [(year, article), ...]
    target_articles = articles.filter(
        lambda year_article: article_contains_word(year_article[
            1], target_word, preprocess_type))
    # [((year, [word, word, ...]), 1), ...]
    words = target_articles.map(lambda year_article: (
        (year_article[0],
         get_article_keywords(year_article[1], keywords, preprocess_type)), 1))
    # [((year, [word, word, ...]), 1), ...]
    match_words = words.filter(
        lambda yearword_count: len(yearword_count[0][1]) > 0)
    # [((year, "target_word, word, word, ..."), 1), ...]
    # Convert word list to string so can serve as a key.
    multi_words = match_words.map(lambda yearword_count: ((yearword_count[0][
        0], ",".join(yearword_count[0][1])), yearword_count[1]))
    # [((year, "word, word, ..."), 1), ...]
    # =>
    # [((year, "word, word, ..."), count), ...]
    # =>
    # [((year, ("word, word, ...", count)), ...]
    # =>
    # [((year, [{"words": "word, word, ...",
    #            "count": count}, ...],
    #          ...]
    # list of words is restored from string of words.
    result = multi_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1],
               yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0],
              word_article_count_list_to_dict(target_word,
                                              year_wordcount[1])))\
        .collect()
    return result
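# Small illustration (not part of the query) of why the matched keyword list is
# joined into a string above: Python lists are unhashable, so they cannot serve
# as RDD keys, whereas the comma-joined string can, and the list is recoverable.
_words = ["famine", "clearance"]
_key = ",".join(_words)           # 'famine,clearance' -- hashable key
assert _key.split(",") == _words  # original list restored downstream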
def do_query(archives, config_file=None, logger=None, context=None):
    """
    The query counts as a "hit" every page in which a particular term from a
    lexicon is found, and it groups the results by year.

    The config_file must indicate the path to a lexicon file with a list of the
    keywords to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the
    defoe path and the type of operating system.

    If a term appears several times in the same page, it is still counted as "1".

        Example:
            1795:
            - - kail
              - 1
            - - aff
              - 4
            - - lairds
              - 1

    This means that, across all the books from 1795, "kail" appears in 1 page,
    "aff" in 4 pages and "lairds" in 1 page.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type= "sys-i386-snow-leopard"
    else:
            os_type = "sys-i386-64"
    if "defoe_path" in config :
        defoe_path= config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config, os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document) for document in list(archive)])
    # [(year, page_string)
    
    
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],  
                                    clean_page_as_string(page, defoe_path, os_type)) 
                                       for page in year_document[1]])
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], 
                                    preprocess_clean_page(cl_page[1], preprocess_type))]) 
    # [(year, page_string)
    filter_pages = pages.filter(
        lambda year_page: any(
            keysentence in year_page[1] for keysentence in keysentences))
    
    
    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                              get_sentences_list_matches(
                                  year_page[1],
                                  keysentences)))
    

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])


    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
Beispiel #18
0
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and groups the
    results by word.

    The config_file must indicate the path to a lexicon file with a list of the
    keywords to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the
    defoe path and the type of operating system.

    Returns result of form:

        {
          <WORD>:
          [
            [<YEAR>, <NUM_WORDS>],
            ...
          ],
          <WORD>:
          ...

        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, page_string)

    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], clean_page_as_string(page, defoe_path, os_type)) for page in
                                                           year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[
        0], preprocess_clean_page(cl_page[1], preprocess_type))])
    # [(year, page_string)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(lambda year_page: (year_page[
        0], get_sentences_list_matches(year_page[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(keysentence, (year, num_keysentences)), ...]
    # =>
    # [(keysentence, [year, num_keysentences]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count:
             (yearsentence_count[0][1],
              (yearsentence_count[0][0], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
Beispiel #19
0
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets concordance and collocation for keywords occurring in
    articles which have a target word and groups the results by date.
    Words in articles, target words and keywords can be normalized,
    normalized and stemmed, or normalized and lemmatized (default).
    A window size (default 10) determines the size of the concordance
    returned.
    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        window: <WINDOW_SIZE> # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of keywords to search for, one per line. The first word is assumed
    to be the target word. If <DATA_FILE> is a relative path then it
    is assumed to be relative to the directory in which config_file
    resides.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line. The first word
    is assumed to be the target word.
    Returns result of form:

        {
            <YEAR>:
            [
                [<FILENAME>, <WORD>, <CONCORDANCE>, <OCR>],
                [<FILENAME>, <WORD>, <CONCORDANCE>, <OCR>],
                ...
            ],
            <YEAR>:
            ...
        }

    :param issues: RDD of defoe.alto.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    window = query_utils.extract_window_size(config)
    keywords = []
    with open(data_file, "r") as f:
        keywords = [query_utils.preprocess_word(
            word, preprocess_type) for word in list(f)]
    target_word = keywords[0]

    # [(year, article, filename, ocr), ...]
    articles = issues.flatMap(
        lambda issue: [(issue.date.year,
                        article,
                        issue.filename,
                        article.quality)
                       for article in issue.articles])
    # [(year, article, filename, ocr), ...]
    target_articles = articles.filter(
        lambda year_article_file_ocr: article_contains_word(
            year_article_file_ocr[1], target_word, preprocess_type))
    # [(year, article, filename, [(word, idx), (word, idx) ...], ocr), ...]
    matching_idx = target_articles.map(
        lambda year_article_file_ocr: (
            (year_article_file_ocr[0],
             year_article_file_ocr[1],
             year_article_file_ocr[2],
             get_article_keyword_idx(year_article_file_ocr[1],
                                     keywords,
                                     preprocess_type),
             year_article_file_ocr[3])
        )
    )
    # [(year, [(filename, word, [concordance, ...], ocr), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_article_file_matches_ocr: [
            (year_article_file_matches_ocr[0],
             (year_article_file_matches_ocr[2],
              word_idx[0],
              get_concordance(year_article_file_matches_ocr[1],
                              word_idx[0],
                              word_idx[1],
                              window,
                              preprocess_type),
              year_article_file_matches_ocr[4]))
            for word_idx in year_article_file_matches_ocr[3]])

    # [(year, [(filename, word, concordance, ocr),
    #          (filename, word, concordance, ocr), ...]), ...]
    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
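# Illustrative sketch (not defoe's own get_concordance) of the windowing idea
# described above: take `window` words either side of the match at index `idx`.
def _window_around(words, idx, window=10):
    return words[max(0, idx - window):idx + window + 1]

# _window_around("a b c d e f g".split(), 3, window=2) -> ['b', 'c', 'd', 'e', 'f']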
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords or keysentences and groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in articles are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, clean_article_as_string(article))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])
    # [(year, clean_article_string)
    filter_articles = t_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
Beispiel #21
0
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Crops articles' images for keywords and groups by word.

    config_file must be a YAML file with the following values:
        * preprocess: Treatment to use for preprocessing the words. Options: [normalize|stem|lemmatize|none]
        * data: YAML file with a list of the target words and a list of keywords to search for.
                This should be in the same directory as the configuration file.
        * years_filter: Minimum and maximum years used to filter the data, separated by "-".
        * output_path: The path in which to store the cropped images.

    Returns result of form:

        {
          <WORD>:
          [
            { "article_id": <ARTICLE ID>,
              "issue_filename": <ISSUE.ZIP>, 
              "issue_id": <ISSUE ID>
              "coord": <COORDENATES>,
              "cropped_image": <IMAGE.JPG> 
              "page_area": <PAGE AREA>,
              "page_filename": < PAGE FILENAME>,
              "place": <PLACE>,
              "textblock_id": <TEXTBLOCK ID>,
              "title": <TITLER>,
              "words": <WORDS>,
              "preprocessed_words": <PREPROCESSED WORDS> 
              "year": <YEAR>,
              "date": <DATE>,
              "distance": <DISTANCE BETWEEN TARGET AND KEYWORD>,
              "total_words": <NUMBER OF WORDS IN TEXTBLOCK>
            },
            ...
          ],
          <WORD>:
          ...
        }
    :param archives: RDD of defoe.fmp.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    year_min, year_max = query_utils.extract_years_filter(config)
    output_path = query_utils.extract_output_path(config)
    keywords = []
    with open(data_file, 'r') as f:
        input_words = yaml.load(f)

    target_words = set([
        query_utils.preprocess_word(word, preprocess_type)
        for word in input_words['targets']
    ])
    keywords = set([
        query_utils.preprocess_word(word, preprocess_type)
        for word in input_words['keywords']
    ])

    # retrieve the documents from each archive
    documents = archives.flatMap(lambda archive: [
        document for document in archive
        if int(year_min) <= document.year <= int(year_max)
    ])

    # find textblocks that contain pairs of (target word, keyword) and record their distance
    filtered_words = documents.flatMap(lambda document: find_words(
        document, target_words, keywords, preprocess_type))

    # create the output dictionary
    # mapping from
    #   [MatchedWords(target_word, keyword, textblock_location, distance, words, preprocessed)]
    # to
    #   [(word, {"article_id": article_id, ...}), ...]
    matching_docs = filtered_words.map(lambda matched: (
        matched.keyword,
        {
            "title":
            matched.textblock.document.title,
            "place":
            matched.textblock.document.place,
            "article_id":
            matched.textblock.article,
            "textblock_id":
            matched.textblock.textblock_id,
            "coord":
            matched.textblock.textblock_coords,
            "page_area":
            matched.textblock.textblock_page_area,
            "year":
            matched.textblock.year,
            "date":
            matched.textblock.document.date,
            # "words": matched.words,
            # "preprocessed_words":  matched.preprocessed,
            "page_filename":
            matched.textblock.textblock_page_name,
            "issue_id":
            matched.textblock.document.documentId,
            "issue_dirname":
            matched.textblock.document.archive.filename,
            "target_word":
            matched.target_word,
            "distance":
            matched.distance,
            "cropped_image":
            segment_image(matched.textblock.textblock_coords, matched.textblock
                          .textblock_page_name, matched.textblock.document.
                          archive.filename, matched.keyword, output_path,
                          matched.target_word)
        }))

    # group by the matched keywords and collect all the articles by keyword
    # [(word, {"article_id": article_id, ...}), ...]
    # =>
    # [(word, [{"article_id": article_id, ...], {...}), ...)]
    # sorted by distance between target and keyword
    result = matching_docs \
        .groupByKey() \
        .map(lambda word_context:
             (word_context[0], sorted(list(word_context[1]), key=lambda d: d['distance']))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words for keywords, and groups the results by date.

    Data in ES has the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:
          [(year, [(title, edition, archive_filename, filename, word, concordance),
              (title, edition, archive_filename, filename, word, concordance), ...]), ...]


    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 20
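    # Note: the concordance window is hard-coded to 20 words here rather than
    # read from the configuration file.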
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))


    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(lambda archive: [(
        document.year, document, document.title, document.edition, document.
        archive.filename) for document in list(archive)])
    # [(year, page_string)

    #(year, title, edition, archive_filename, page_code, clean_page_string)
    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], year_document[2], year_document[3], year_document[
            4], page.code, clean_page_as_string(
                page, defoe_path, os_type)) for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(lambda cl_page: [(
        cl_page[0], cl_page[1], cl_page[2], cl_page[3], cl_page[4],
        preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keyword_idx(year_page[5], keysentences))))

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
                {"title":year_idx[1], "edition": year_idx[2], "archive_filename": year_idx[3], "filename":year_idx[4], "term": word_idx[0],\
                 "snippet": get_concordance_string(year_idx[5], word_idx[0], word_idx[1], window)})\
                 for word_idx in year_idx[6]])

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
Beispiel #23
0
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords and groups by year.
    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.
    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.
    Returns result of form:
        {
          <YEAR>:
          [
            [<WORD>, <NUM_WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }
    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [((year, word), 1), ...]
    words = documents.flatMap(lambda year_document: [(
        (year_document[0], query_utils.preprocess_word(word, preprocess_type)
         ), 1) for page in year_document[1] for word in page.words])
    # [((year, word), 1), ...]
    matching_words = words.filter(
        lambda yearword_count: yearword_count[0][1] in keywords)
    # [((year, word), num_words), ...]
    # =>
    # [(year, (word, num_words)), ...]
    # =>
    # [(year, [word, num_words]), ...]
    result = matching_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1], yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0], list(year_wordcount[1]))) \
        .collect()
    return result
Beispiel #24
0
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of times that each keyword appears for every article
    that has a target word in it.

    Words in articles, target words and keywords can be normalized,
    normalized and stemmed, or normalized and lemmatized (default).

    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of keywords to search for, one per line. The first word is assumed
    to be the target word. If <DATA_FILE> is a relative path then it
    is assumed to be relative to the directory in which config_file
    resides.

    Returns result of form:

        {
            <YEAR>:
            [
                [<WORD>, <NUM_WORDS>],
                [<WORD>, <NUM_WORDS>],
                ...
            ],
            <YEAR>:
            ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    target_word = keywords[0]
    # [(year, article), ...]
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # [(year, article), ...]
    target_articles = articles.filter(
        lambda year_article: article_contains_word(year_article[
            1], target_word, preprocess_type))

    # [((year, word), 1), ...]
    words = target_articles.flatMap(lambda target_article: [(
        (target_article[0], query_utils.preprocess_word(word, preprocess_type)
         ), 1) for word in target_article[1].words])

    # [((year, word), 1), ...]
    matching_words = words.filter(
        lambda yearword_count: yearword_count[0][1] in keywords)
    # [((year, word), num_words), ...]
    # =>
    # [(year, (word, num_words)), ...]
    # =>
    # [(year, [word, num_words]), ...]
    result = matching_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1], yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0], list(year_wordcount[1]))) \
        .collect()
    return result
def do_query(df, config_file=None, logger=None, context=None):
    """
    Reads from HDFS, and counts the number of occurrences of keywords or keysentences, grouping the results by year.
    Each entry in the HDFS file has the following information:
    
    "title",  "edition", "year", "place", "archive_filename",  "source_text_filename", 
    "text_unit", "text_unit_id", "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem", "num_words"

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param df: DataFrame of NLS pages read from HDFS
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """

    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_config = config["preprocess"]
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    # Keep only pages that are not null and whose model is "nls", and select only 2 columns: the year and the page as a string (either raw or preprocessed).
    if preprocess_config == "normalize":
        fdf = df.withColumn("source_text_norm",
                            blank_as_null("source_text_norm"))
        newdf = fdf.filter(fdf.source_text_norm.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.source_text_norm)
    elif preprocess_config == "lemmatize":
        fdf = df.withColumn("source_text_lemmatize",
                            blank_as_null("source_text_lemmatize"))
        newdf = fdf.filter(fdf.source_text_lemmatize.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.source_text_lemmatize)
    elif preprocess_config == "stem":
        fdf = df.withColumn("source_text_stem",
                            blank_as_null("source_text_stem"))
        newdf = fdf.filter(fdf.source_text_stem.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.source_text_stem)
    else:
        fdf = df.withColumn("source_text_clean",
                            blank_as_null("source_text_clean"))
        newdf = fdf.filter(fdf.source_text_clean.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.source_text_clean)

    pages = newdf.rdd.map(tuple)
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # We may also need to convert the year string to an integer, splitting first on the '.'.
    matching_pages = filter_pages.map(lambda year_page: (year_page[
        0], get_sentences_list_matches(year_page[1], keysentences)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
def do_query(issues, config_file=None, logger=None, context=None):

    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    unproc_targetwords = config['targetwords']
    targetwords = []
    for t in unproc_targetwords:
        targetwords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in t.split()
        ]))
    print(f'targetwords: {targetwords}')
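    # The configuration for this query keeps the word lists inline rather than in
    # a separate data file, e.g. (hypothetical values):
    #
    #     preprocess: normalize
    #     defoe_path: /home/user/defoe/
    #     targetwords:
    #         - emigration
    #     keywords:
    #         - famine
    #         - eviction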

    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    preprocessed_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, clean_article_string)
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(t in year_article[3] for t in targetwords))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.flatMap(
        lambda year_article: [(year_article[0], year_article[1], year_article[
            2], k) for k in find_matches(year_article[3], keywords)])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id": sentence_data[2].article_id,
            "page_ids": list(sentence_data[2].page_ids),
            "section": sentence_data[2].ct,
            "keyword": sentence_data[3],
            "targets": list(targetwords),
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Beispiel #27
0
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words for keywords, and groups the results by date.

    Data in HDFS has the following columns:
    
    "title",  "edition", "year", "place", "archive_filename",  "source_text_filename", "text_unit", 
    "text_unit_id", "num_text_unit", "type_archive", "model", "type_page", "header", "term", "definition",
    "num_articles", "num_page_words", "num_article_words", 

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the
    defoe path and the type of operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "title": ...,
              "edition": ...,
              "archive_filename": ...,
              "filename": ...,
              "page number": ...,
              "type_page": ...,
              "header": ...,
              "term": ...,
              "article": ...,
              "article-definition": ...
            },
            ...
          ],
          <YEAR>:
          ...
        }
  
    :param df: DataFrame of NLS articles read from HDFS
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Keep only articles with a non-null definition and whose model is "nlsArticles", and select the article-level columns.
    fdf = df.withColumn("definition", blank_as_null("definition"))
    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text)
    newdf = fdf.filter(fdf.definition.isNotNull()).filter(
        fdf["model"] == "nlsArticles").select(fdf.year, fdf.title, fdf.edition,
                                              fdf.archive_filename,
                                              fdf.source_text_filename,
                                              fdf.text_unit_id, fdf.type_page,
                                              fdf.header, fdf.term,
                                              fdf.definition)
    articles = newdf.rdd.map(tuple)

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, article, preprocess_article, clean_article)

    preprocess_articles = articles.flatMap(lambda t_articles: [(
        t_articles[0], t_articles[1], t_articles[2], t_articles[3], t_articles[
            4], t_articles[5], t_articles[6], t_articles[7], t_articles[8],
        preprocess_clean_page(t_articles[9], preprocess_type), t_articles[9])])

    filter_articles = preprocess_articles.filter(lambda year_page: any(
        keysentence in year_page[9] for keysentence in keysentences))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, list_sentences)
    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[
            3], year_article[4], year_article[5], year_article[
                6], year_article[7], year_article[8], year_article[10],
        get_articles_list_matches(year_article[9], keysentences)))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, sentence)
    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3],
                                year_sentence[4], year_sentence[5], year_sentence[6], year_sentence[7],
                                year_sentence[8], year_sentence[9], sentence)\
                                for sentence in year_sentence[10]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[1],
            "edition": sentence_data[2],
            "archive_filename": sentence_data[3],
            "filename": sentence_data[4],
            "page number": sentence_data[5],
            "type_page": sentence_data[6],
            "header": sentence_data[7],
            "term": sentence_data[10],
            "article": sentence_data[8],
            "article-definition": sentence_data[9]
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Beispiel #28
0
def do_query(issues, config_file=None, logger=None, context=None):
    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(
            article, defoe_path, os_type)) for article in issue.articles])

    preprocessed_articles = clean_articles.map(lambda cl_article: (cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type)))

    # [(year, article_string)
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(k in year_article[1] for k in keywords))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keywords)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda a: (a[0], [{x[0]: x[1]} for x in a[1]])) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets the concordance (also called details) of occurrences of keywords or
    keysentences and groups the results by year.

    The config_file must indicate the path to a lexicon file with a list of the
    keywords to search for, one per line.

    The config_file can also indicate the preprocessing treatment, along with the
    defoe path and the type of operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "title": ...,
              "place": ...,
              "publisher": ...,
              "page_number": ...,
              "snippet": ...,
              "term": ...,
              "document_id": ...,
              "filename": ...
            },
            ...
          ],
          <YEAR>:
          ...
        }
        
       

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, page_string)

    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[
            1], page, clean_page_as_string(page, defoe_path, os_type))
                               for page in year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[0], cl_page[
        1], cl_page[2], preprocess_clean_page(cl_page[3], preprocess_type))])
    # [(year, page_string)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[
            2], get_sentences_list_matches(year_page[3], keysentences)))

    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], sentence)\
                                for sentence in year_sentence[3]])

    matching_data = matching_sentences.map(
        lambda page_data: (page_data[0], {
            "title": page_data[1].title,
            "place": page_data[1].place,
            "publisher": page_data[1].publisher,
            "page_number": page_data[2].code,
            "snippet": page_data[2].content,
            "term": page_data[3],
            "document_id": page_data[1].code,
            "filename": page_data[1].archive.filename
        }))


    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result