def calculate_words_within_dictionary(
        page, preprocess_type=PreprocessWordType.NORMALIZE):
    """
    Calculates the % of page words within a dictionary and also returns the page quality (pc)
    Page words are normalized. 
    :param page: Page
    :type page: defoe.nls.page.Page
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :return: matches
    :rtype: list(str or unicode)
    """
    dictionary = words.words()
    counter = 0
    total_words = 0
    for word in page.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if preprocessed_word != "":
            total_words += 1
            if preprocessed_word in dictionary:
                counter += 1
    try:
        calculate_pc = str(counter * 100 / total_words)
    except ZeroDivisionError:
        calculate_pc = "0"
    return calculate_pc
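
# Illustrative sketch (not part of the original examples): applying
# calculate_words_within_dictionary across an RDD of defoe.nls.archive.Archive
# objects, following the Spark patterns used by the do_query functions below.
# The function name is hypothetical and `archives` is assumed to be such an RDD.
def page_quality_by_year(archives,
                         preprocess_type=PreprocessWordType.NORMALIZE):
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document)
                         for document in list(archive)])
    # [(year, pc_string), ...] - one entry per page
    return documents.flatMap(lambda year_document: [
        (year_document[0],
         calculate_words_within_dictionary(page, preprocess_type))
        for page in year_document[1]
    ]).collect()
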
def do_query(hansards, config_file=None, logger=None, context=None):
    """
    Finds Hansard discussion headings whose text contains any of the
    configured keywords and returns, grouped by heading id, the speeches of
    each matching discussion along with the matched keywords.
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    # [(hansard, heading), ...]
    headings = hansards.flatMap(
        lambda hansard: [(hansard, h) for h in get_headings(hansard)])

    # [(hansard, heading, clean_text), ...]
    discussion_text = headings.map(
        lambda hansard_discussion: (hansard_discussion[0], hansard_discussion[
            1], clean_text(hansard_discussion[1], preprocess_type)))

    # [(hansard, heading, clean_text), ...]
    filter_discussions = discussion_text.filter(
        lambda disc: any(k in disc[2] for k in keywords))

    # [(hansard, heading, matched_keywords), ...]
    matching_discussions = filter_discussions.map(
        lambda disc: (disc[0], disc[1], find_matches(disc[2], keywords)))

    matching_data = matching_discussions.flatMap(
        lambda discussion: [(discussion[1]._id, {
            "title":
            discussion[1].title,
            "heading_id":
            discussion[1]._id,
            "speech_id":
            speech._id,
            "speaker": ((speech.speaker.id, speech.speaker.name)
                        if speech.speaker is not None else ''),
            "text":
            speech.text,
            "filename":
            discussion[0].filename,
            "term":
            list(discussion[2]),
        }) for speech in discussion[1].speeches])

    result = matching_data \
        .groupByKey() \
        .map(lambda speech:
             (speech[0], list(speech[1]))) \
        .collect()
    return result
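
# A minimal example (illustrative, not from the original examples) of the YAML
# configuration this query expects; only the keys read above are shown and the
# values are hypothetical:
#
#   preprocess: normalize
#   keywords:
#     - education
#     - public health
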
def find_words(document,
               target_words,
               keywords,
               preprocess_type=query_utils.PreprocessWordType.LEMMATIZE):
    """
    If a keyword occurs more than once on a page, there will be only
    one tuple for the page for that keyword.
    If more than one keyword occurs on a page, there will be one tuple
    per keyword.
    The distance between keyword and target word is recorded in the output tuple.
    :param document: document
    :type document: defoe.alto.document.Document
    :param keywords: keywords
    :type keywords: list(str or unicode:
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: list of tuples
    :rtype: list(tuple)
    """
    matches = []
    document_articles = document.articles
    for article in document_articles:
        for tb in document_articles[article]:
            keys = defaultdict(lambda: [])
            targets = []
            preprocessed_words = []
            for pos, word in enumerate(tb.words):
                preprocessed_word = query_utils.preprocess_word(
                    word, preprocess_type)
                loc = WordLocation(word=preprocessed_word,
                                   position=pos,
                                   year=document.year,
                                   document=document,
                                   article=article,
                                   textblock_id=tb.textblock_id,
                                   textblock_coords=tb.textblock_coords,
                                   textblock_page_area=tb.textblock_page_area,
                                   textblock_page_name=tb.page_name)
                preprocessed_words.append(preprocessed_word)
                if preprocessed_word in keywords:
                    keys[preprocessed_word].append(loc)
                if preprocessed_word in target_words:
                    targets.append(loc)
            for k, l in keys.items():
                min_distance, target_loc, keyword_loc = get_min_distance_to_target(
                    l, targets)
                if min_distance:
                    matches.append(
                        MatchedWords(target_word=target_loc.word,
                                     keyword=keyword_loc.word,
                                     textblock=target_loc,
                                     distance=min_distance,
                                     words=tb.words,
                                     preprocessed=preprocessed_words))

    return matches
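
# Illustrative sketch (not part of the original examples) of the kind of helper
# find_words relies on: get_min_distance_to_target returns the smallest
# positional distance between any keyword occurrence and any target-word
# occurrence, together with the two WordLocation objects involved. This is an
# assumed re-implementation based only on how the helper is called above, not
# the library's actual code; hence the _sketch suffix.
def get_min_distance_to_target_sketch(keyword_locations, target_locations):
    min_distance = None
    best_target = None
    best_keyword = None
    for keyword_loc in keyword_locations:
        for target_loc in target_locations:
            distance = abs(keyword_loc.position - target_loc.position)
            if min_distance is None or distance < min_distance:
                min_distance = distance
                best_target = target_loc
                best_keyword = keyword_loc
    return min_distance, best_target, best_keyword
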
def clean_text(heading, preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Concatenates the preprocessed words of all speeches under a heading
    into a single string.
    """
    words = []
    for speech in heading.speeches:
        st = speech.text
        if st is not None:
            words += st.split()
    all_text = ''
    for word in words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        all_text += (' ' + preprocessed_word)
    return all_text
def preprocess_clean_page_spacy(clean_page,
                                preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Preprocesses each word of an already cleaned page and returns the page
    as a single string.
    """
    clean_list = clean_page.split(' ')
    page_string = ''
    for word in clean_list:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if page_string == '':
            page_string = preprocessed_word
        else:
            page_string += (' ' + preprocessed_word)
    return page_string
def preprocess_clean_article(clean_article,
                             preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Preprocesses each word of an already cleaned article and returns the
    article as a single string.
    """
    clean_list = clean_article.split(' ')
    article_string = ''
    for word in clean_list:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if article_string == '':
            article_string = preprocessed_word
        else:
            article_string += (' ' + preprocessed_word)
    return article_string
def article_stop_words_removal(article,
                               preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Remove the stop words of an article.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: article words without stop words
    :rtype: list(str or unicode)
    """
    stop_words = set(stopwords.words('english'))
    article_words = []
    for word in article.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if preprocessed_word not in stop_words:
            article_words.append(preprocessed_word)
    return article_words
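
# Note (illustrative, not from the original examples): stopwords.words('english')
# above is assumed to come from the standard NLTK stopwords corpus, which must
# be downloaded once before use, e.g.:
#
#   import nltk
#   nltk.download('stopwords')
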
def get_page_as_string(page, preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Return a page as a single string.

    :param page: Page
    :type page: defoe.nls.Page
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: page words as a string
    :rtype: string or unicode
    """
    page_string = ''
    for word in page.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if page_string == '':
            page_string = preprocessed_word
        else:
            page_string += (' ' + preprocessed_word)
    return page_string
def get_article_as_string(article,
                          preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Return an article as a single string.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: article words as a string
    :rtype: string or unicode
    """
    article_string = ''
    for word in article.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if article_string == '':
            article_string = preprocessed_word
        else:
            article_string += (' ' + preprocessed_word)
    return article_string
def article_contains_word(article,
                          keyword,
                          preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Check if a keyword occurs within an article.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param keyword: keyword
    :type keyword: str or unicode
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: True if the article contains the word, False otherwise
    :rtype: bool
    """
    for word in article.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if keyword == preprocessed_word:
            return True
    return False
def get_concordance(article,
                    keyword,
                    idx,
                    window,
                    preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    For a given keyword (and its position in an article), return
    the concordance of words (before and after) using a window.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param keyword: keyword
    :type keyword: str or unicode
    :param idx: keyword index (position) in list of article's words
    :type idx: int
    :param window: number of words to the right and left
    :type window: int
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: concordance
    :rtype: list(str or unicode)
    """
    article_size = len(article.words)

    if idx >= window:
        start_idx = idx - window
    else:
        start_idx = 0

    if idx + window >= article_size:
        end_idx = article_size
    else:
        end_idx = idx + window + 1

    concordance_words = []
    for word in article.words[start_idx:end_idx]:
        concordance_words.append(
            query_utils.preprocess_word(word, preprocess_type))
    return concordance_words
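
# Worked example (illustrative): with window=2 and a keyword at idx=5 in an
# article of 20 words, start_idx = 5 - 2 = 3 and end_idx = 5 + 2 + 1 = 8, so
# article.words[3:8] yields the keyword plus two words on each side. At the
# edges the slice is clipped: idx=1 gives [0:4] and idx=19 gives [17:20].
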
def get_page_matches(document,
                     keywords,
                     preprocess_type=PreprocessWordType.NORMALIZE):
    """
    Get pages within a document that include one or more keywords.
    For each page that includes a specific keyword, add a tuple of
    form:

        (<YEAR>, <DOCUMENT>, <PAGE>, <KEYWORD>)

    If a keyword occurs more than once on a page, there will be only
    one tuple for the page for that keyword.
    If more than one keyword occurs on a page, there will be one tuple
    per keyword.

    :param document: document
    :type document: defoe.nls.document.Document
    :param keywords: keywords
    :type keywords: list(str or unicode)
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: list of tuples
    :rtype: list(tuple)
    """
    matches = []
    for keyword in keywords:
        for page in document:
            match = None
            for word in page.words:
                preprocessed_word = query_utils.preprocess_word(
                    word, preprocess_type)
                if preprocessed_word == keyword:
                    match = (document.year, document, page, keyword)
                    break
            if match:
                matches.append(match)
                continue  # move to next page
    return matches
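
# Illustrative sketch (not part of the original examples): get_page_matches is
# typically applied across an RDD of documents inside a query, mirroring the
# flatMap pattern used by the do_query functions below. `documents` is assumed
# to be an RDD of defoe.nls.document.Document objects and `keywords` a list of
# already preprocessed keywords.
#
#   matches = documents.flatMap(
#       lambda document: get_page_matches(document, keywords))
#   # [(year, document, page, keyword), ...]
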
def get_article_keywords(article,
                         keywords,
                         preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Get list of keywords occurring within an article.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param keywords: keywords
    :type keywords: list(str or unicode)
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: sorted list of keywords that occur within article
    :rtype: list(str or unicode)
    """
    matches = set()
    for word in article.words:
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if preprocessed_word in keywords:
            matches.add(preprocessed_word)
    return sorted(list(matches))
def get_article_matches(document,
                        keywords,
                        preprocess_type=PreprocessWordType.LEMMATIZE):
    """
        (<YEAR>, <DOCUMENT>, <ARTICLE>, <BLOCK_ID>, <COORDENATES>, <PAGE_AREA>, <ORIGINAL_WORDS>,<PREPROCESSED_WORDS>, <PAGE_NAME>, <KEYWORDS> )
    If a keyword occurs more than once on a page, there will be only
    one tuple for the page for that keyword.
    If more than one keyword occurs on a page, there will be one tuple
    per keyword.
    :param document: document
    :type document: defoe.alto.document.Document
    :param keywords: keywords
    :type keywords: list(str or unicode:
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: list of tuples
    :rtype: list(tuple)
    """
    matches = []
    document_articles = document.articles
    for keyword in keywords:
        for article in document_articles:
            for tb in document_articles[article]:
                match = None
                tb_preprocessed_words = []
                for word in tb.words:
                    preprocessed_word = query_utils.preprocess_word(
                        word, preprocess_type)
                    tb_preprocessed_words.append(preprocessed_word)
                for preprocessed_word in tb_preprocessed_words:
                    if preprocessed_word == keyword:
                        match = (document.year, document, article,
                                 tb.textblock_id, tb.textblock_coords,
                                 tb.textblock_page_area, tb.words,
                                 tb_preprocessed_words, tb.page_name, keyword)
                        break
                if match:
                    matches.append(match)
                    continue  # move to next text block
    return matches
def get_article_keyword_idx(article,
                            keywords,
                            preprocess_type=PreprocessWordType.LEMMATIZE):
    """
    Gets a list of keywords (and their position indices) within an
    article.

    :param article: Article
    :type article: defoe.papers.article.Article
    :param keywords: keywords
    :type keywords: list(str or unicode)
    :param preprocess_type: how words should be preprocessed
    (normalize, normalize and stem, normalize and lemmatize, none)
    :type preprocess_type: defoe.query_utils.PreprocessWordType
    :return: sorted list of keywords and their indices
    :rtype: list(tuple(str or unicode, int))
    """
    matches = set()
    for idx, word in enumerate(article.words):
        preprocessed_word = query_utils.preprocess_word(word, preprocess_type)
        if preprocessed_word in keywords:
            match = (preprocessed_word, idx)
            matches.add(match)
    return sorted(list(matches))
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords or keysentences and groups by year.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in articles are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, clean_article_as_string(article))
                       for article in issue.articles])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])
    # [(year, clean_article_string)
    filter_articles = t_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keysentences)))

    # [[(year, keysentence), 1) ((year, keysentence), 1) ] ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
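
# Illustrative configuration for this query (not from the original examples):
# the config file points to the lexicon data file, which lists one
# keyword/keysentence per line. Keys match those read above; file names and
# values are hypothetical.
#
#   # config.yml
#   preprocess: normalize
#   data: keysentences.txt
#
#   # keysentences.txt
#   steam engine
#   railway
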
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words, for keywords and groups by date.

    Data in HDFS have the following colums:
    
    "title",  "edition", "year", "place", "archive_filename",  "source_text_filename", "text_unit", 
    "text_unit_id", "num_text_unit", "type_archive", "model", "type_page", "header", "term", "definition",
    "num_articles", "num_page_words", "num_article_words", 

    config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    Also the config_file can indicate the preprocess treatment, along with the defoe
    path, and the type of operating system. 

      Returns result of form:
        {
          <YEAR>:
          [
            [- title: 
             - edition:
             - archive_filename:
             - page number:
             - header:
             - term:
             - article:
             - article-definition: ], 
             [], 
            ...
         
          <YEAR>:
          ...
        }
  
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Keep only the rows whose "definition" is non-null and whose model is
    # "nlsArticles", and select the columns needed for this query.
    fdf = df.withColumn("definition", blank_as_null("definition"))
    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text)
    newdf = fdf.filter(fdf.definition.isNotNull()).filter(
        fdf["model"] == "nlsArticles").select(fdf.year, fdf.title, fdf.edition,
                                              fdf.archive_filename,
                                              fdf.source_text_filename,
                                              fdf.text_unit_id, fdf.type_page,
                                              fdf.header, fdf.term,
                                              fdf.definition)
    articles = newdf.rdd.map(tuple)

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, article, preprocess_article, clean_article)

    preprocess_articles = articles.flatMap(lambda t_articles: [(
        t_articles[0], t_articles[1], t_articles[2], t_articles[3], t_articles[
            4], t_articles[5], t_articles[6], t_articles[7], t_articles[8],
        preprocess_clean_page(t_articles[9], preprocess_type), t_articles[9])])

    filter_articles = preprocess_articles.filter(lambda year_page: any(
        keysentence in year_page[9] for keysentence in keysentences))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, list_sentences)
    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[
            3], year_article[4], year_article[5], year_article[
                6], year_article[7], year_article[8], year_article[10],
        get_articles_list_matches(year_article[9], keysentences)))

    #(year, title, edition, archive_filename, page_filename, page_number, type of page, header, term, article_text, sentence)
    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3],
                                year_sentence[4], year_sentence[5], year_sentence[6], year_sentence[7],
                                year_sentence[8], year_sentence[9], sentence)\
                                for sentence in year_sentence[10]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[1],
            "edition": sentence_data[2],
            "archive_filename": sentence_data[3],
            "filename": sentence_data[4],
            "page number": sentence_data[5],
            "type_page": sentence_data[6],
            "header": sentence_data[7],
            "term": sentence_data[10],
            "article": sentence_data[8],
            "article-definition": sentence_data[9]
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by date.

    Data in ES have the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must indicate the list of words/sentences (lexicon) to search
    for, and the preprocess treatment to select from ES.


    Returns result of form:

        {
          <YEAR>:
          [
            {"title": 
            "edition": 
            "archive_name": 
            "filename": 
            "text": 
            "keysentence": 
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param df: DataFrame with pages data read from ES
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    # Keep only the pages whose selected preprocessed text column is non-null
    # and whose model is "nls", and select the year, title, edition, archive
    # filename, source filename and text columns.
    if preprocess_config == "normalize":
        fdf = df.withColumn("source_text_norm",
                            blank_as_null("source_text_norm"))
        newdf = fdf.filter(fdf.source_text_norm.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_norm)
    elif preprocess_config == "lemmatize":
        fdf = df.withColumn("source_text_lemmatize",
                            blank_as_null("source_text_lemmatize"))
        newdf = fdf.filter(fdf.source_text_lemmatize.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_lemmatize)
    elif preprocess_config == "stem":
        fdf = df.withColumn("source_text_stem",
                            blank_as_null("source_text_stem"))
        newdf = fdf.filter(fdf.source_text_stem.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_stem)
    else:
        fdf = df.withColumn("source_text_clean",
                            blank_as_null("source_text_clean"))
        newdf = fdf.filter(fdf.source_text_clean.isNotNull()).filter(
            fdf["model"] == "nls").select(fdf.year, fdf.title, fdf.edition,
                                          fdf.archive_filename,
                                          fdf.source_text_filename,
                                          fdf.source_text_clean)

    pages = newdf.rdd.map(tuple)
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, [(title, edition, archive, filename, text, keysentence), ...]), ...]
    matching_pages = filter_pages.map(lambda year_page: (
        year_page[0],
        get_pages_matches_no_prep(year_page[1], year_page[2], year_page[3],
                                  year_page[4], year_page[5], keysentences)))

    # [[(year, [title, edition, archive, filename, text, keysentence]), (year, [title, edition, archive, filename, text, keysentence]) ..]

    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], data_sentence)
                               for data_sentence in year_sentence[1]])

    matching_docs = matching_sentences.map(lambda date_page:
                                           (date_page[0], {
                                               "title": date_page[1][0],
                                               "edition": date_page[1][1],
                                               "archive_name": date_page[1][2],
                                               "filename": date_page[1][3],
                                               "text": date_page[1][4],
                                               "keysentence": date_page[1][5]
                                           }))

    # [(date, {"title": title, ...}), ...]
    # =>
    # [(date, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
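
# Illustrative configuration (not from the original examples): the `preprocess`
# value selects which preprocessed text column is used ("normalize",
# "lemmatize", "stem", or anything else to fall back to source_text_clean), and
# `data` points to the lexicon file. Values and file names are hypothetical.
#
#   preprocess: lemmatize
#   data: lexicon.txt
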
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences, filtering
    articles by date. Results are grouped by year.

    config_file must be the path to a lexicon file with a list of the
    keywords to search for, one per line.

    The config_file can also indicate the preprocess treatment, along with
    the defoe path and the type of operating system. It can also configure
    how many target words to use and at which position the lexicon words
    start (see the example configuration below).

    The number of target words to take from the lexicon file is indicated
    in the configuration file as, for example, num_target: 1. That means
    that only the first word/sentence is used as the target.

    To include the target words in the lexicon, set lexicon_start: 0 in the
    configuration file.

    To exclude the target words from the lexicon (for example, when there is
    just one target word), set lexicon_start: 1.

    Finally, the dates to use in this query are indicated in the
    configuration file as follows:

      start_year: YEAR_START (inclusive)
      end_year: YEAR_FINISH (inclusive)

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    start_year = int(config["start_year"])
    end_year = int(config["end_year"])
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, article_string), ...]

    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]
    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(article, defoe_path, os_type)
    ) for article in issue.articles if int(
        issue.date.year) >= start_year and int(issue.date.year) <= end_year])

    # [(year, preprocess_article_string), ...]
    t_articles = clean_articles.flatMap(lambda cl_article: [(cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type))])

    # [(year, clean_article_string)
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[1] for target_s in target_sentences))

    # [(year, clean_article_string)
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_articles_list_matches(year_article[1], keysentences)))

    # [[(year, keysentence), 1) ((year, keysentence), 1) ] ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
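
# Illustrative configuration for this query (not from the original examples),
# combining the options described in the docstring; values and file names are
# hypothetical.
#
#   preprocess: lemmatize
#   data: lexicon.txt
#   os_type: linux
#   defoe_path: /home/user/defoe/
#   num_target: 1
#   lexicon_start: 1
#   start_year: 1840
#   end_year: 1890
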
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of articles containing both a target word and one or
    more keywords and groups by year.

    Words in articles, target words and keywords can be normalized,
    normalized and stemmed, or normalized and lemmatized (default).

    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of keywords to search for, one per line. The first word is assumed
    to be the target word. If <DATA_FILE> is a relative path then it
    is assumed to be relative to the directory in which config_file
    resides.

    Returns result of form:

        {
          <YEAR>:
          [
            {
              "target_word": <WORD>,
              "words": [<WORD>, <WORD>, ...],
              "count": <COUNT>
            },
            ...
          ],
          <YEAR>:
          ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    target_word = keywords[0]
    keywords = keywords[1:]

    # [(year, article), ...]
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # [(year, article), ...]
    target_articles = articles.filter(
        lambda year_article: article_contains_word(year_article[
            1], target_word, preprocess_type))
    # [((year, [word, word, ...]), 1), ...]
    words = target_articles.map(lambda year_article: (
        (year_article[0],
         get_article_keywords(year_article[1], keywords, preprocess_type)), 1))
    # [((year, [word, word, ...]), 1), ...]
    match_words = words.filter(
        lambda yearword_count: len(yearword_count[0][1]) > 0)
    # [((year, "target_word, word, word, ..."), 1), ...]
    # Convert word list to string so can serve as a key.
    multi_words = match_words.map(lambda yearword_count: ((yearword_count[0][
        0], ",".join(yearword_count[0][1])), yearword_count[1]))
    # [((year, "word, word, ..."), 1), ...]
    # =>
    # [((year, "word, word, ..."), count), ...]
    # =>
    # [((year, ("word, word, ...", count)), ...]
    # =>
    # [((year, [{"words": "word, word, ...",
    #            "count": count}, ...],
    #          ...]
    # list of words is restored from string of words.
    result = multi_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1],
               yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0],
              word_article_count_list_to_dict(target_word,
                                              year_wordcount[1])))\
        .collect()
    return result
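
# Illustrative data file for this query (not from the original examples): as
# described in the docstring, the first line is the target word and the
# remaining lines are keywords. The words are hypothetical.
#
#   # keywords.txt
#   cholera
#   epidemic
#   sanitation
#   quarantine
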
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by date.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <DATE>:
          [
            {
              "title": <TITLE>,
              "page_ids": <PAGE_IDS>,
              "content": <PAGE_CONTENT>,
              "word": <WORD>,
              "article_id": <ARTICLE_ID>,
              "issue_id": <ISSUE_ID>,
              "filename": <FILENAME>
            },
            ...
          ],
          <DATE>:
          ...
        }

    :param issues: RDD of defoe.alto.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(
                word, preprocess_type) for word in k_split]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    
    # [(date, issue, article, word), ...]
    filtered_words = issues.flatMap(
        lambda issue: get_article_matches(issue,
                                          keysentences,
                                          PreprocessWordType.NORMALIZE))

    # [(date, issue, article, word, article_clean), ...]
    # =>
    # [(date, {"title": title, ...}), ...]
    matching_docs = filtered_words.map(
        lambda date_issue_article_word:
        (date_issue_article_word[0],
         {"title": date_issue_article_word[2].title_string,
          "page_ids": list(date_issue_article_word[2].page_ids),
          "content": date_issue_article_word[4],
          "word": date_issue_article_word[3],
          "article_id": date_issue_article_word[2].article_id,
          "issue_id": date_issue_article_word[1].newspaper_id,
          "filename": date_issue_article_word[1].filename}))

    # [(date, {"title": title, ...}), ...]
    # =>
    # [(date, [{"title": title, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts number of occurrences of keywords or keysentences and groups by words.

    The config_file must indicate the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    Also the config_file can indicate the preprocess treatment, along with the defoe
    path, and the type of operating system.

    Returns result of form:

        {
          <WORD>:
          [
            [<YEAR>, <NUM_WORDS>],
            ...
          ],
          <WORD>:
          ...

        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)
    # [(year, document), ...]
    documents = archives.flatMap(lambda archive: [(document.year, document) for
                                                  document in list(archive)])
    # [(year, clean_page_string), ...]

    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], clean_page_as_string(page, defoe_path, os_type)) for page in
                                                           year_document[1]])
    pages = clean_pages.flatMap(lambda cl_page: [(cl_page[
        0], preprocess_clean_page(cl_page[1], preprocess_type))])
    # [(year, preprocessed_page_string), ...]
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[1] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    matching_pages = filter_pages.map(lambda year_page: (year_page[
        0], get_sentences_list_matches(year_page[1], keysentences)))

    # [[(year, keysentence), 1) ((year, keysentence), 1) ] ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(keysentence, (year, num_keysentences)), ...]
    # =>
    # [(keysentence, [year, num_keysentences]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count:
             (yearsentence_count[0][1],
              (yearsentence_count[0][0], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Counts number of times that each keyword appears for every article
    that has a target word in it.

    Words in articles, target words and keywords can be normalized,
    normalized and stemmed, or normalized and lemmatized (default).

    config_file must be the path to a configuration file of form:

        preprocess: none|normalize|stem|lemmatize # Optional
        data: <DATA_FILE>

    <DATA_FILE> must be the path to a plain-text data file with a list
    of keywords to search for, one per line. The first word is assumed
    to be the target word. If <DATA_FILE> is a relative path then it
    is assumed to be relative to the directory in which config_file
    resides.

    Returns result of form:

        {
            <YEAR>:
            [
                [<WORD>, <NUM_WORDS>],
                [<WORD>, <NUM_WORDS>],
                ...
            ],
            <YEAR>:
            ...
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    target_word = keywords[0]
    # [(year, article), ...]
    articles = issues.flatMap(lambda issue: [(issue.date.year, article)
                                             for article in issue.articles])
    # [(year, article), ...]
    target_articles = articles.filter(
        lambda year_article: article_contains_word(year_article[
            1], target_word, preprocess_type))

    # [((year, word), 1), ...]
    words = target_articles.flatMap(lambda target_article: [(
        (target_article[0], query_utils.preprocess_word(word, preprocess_type)
         ), 1) for word in target_article[1].words])

    # [((year, word), 1), ...]
    matching_words = words.filter(
        lambda yearword_count: yearword_count[0][1] in keywords)
    # [((year, word), num_words), ...]
    # =>
    # [(year, (word, num_words)), ...]
    # =>
    # [(year, [word, num_words]), ...]
    result = matching_words \
        .reduceByKey(add) \
        .map(lambda yearword_count:
             (yearword_count[0][0],
              (yearword_count[0][1], yearword_count[1]))) \
        .groupByKey() \
        .map(lambda year_wordcount:
             (year_wordcount[0], list(year_wordcount[1]))) \
        .collect()
    return result
def do_query(df, config_file=None, logger=None, context=None):
    """
    Read from HDFS, and counts number of occurrences of keywords or keysentences and groups by year.
    Each entry in the HDFS file has the following information:
    - title, edition, year, place, archive filename, page filename, page id, num pages, type of archive, model, type of preprocess treatment, prep_page_string

    Notice that the year is in position "2", the preprocess type in position "10",
    and the preprocessed page as a string in position "11". However, the information per entry has been saved as a string.

    Example of one entry saved as a string:

       u"('Encyclopaedia Britannica', 'Seventh edition, Volume 13, LAB-Magnetism', '1842', 'Edinburgh',
       '/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/193108323', 'alto/193201394.34.xml',
       'Page9', '810', 'book', 'nls', 'PreprocessWordType.NORMALIZE', u'the encyclopaedia britannica dictionary of
        arts sciences and general literature seventh edition i with preliminary dissertations on the history of the
        sciences and other extensive improvements and additions including the late supplement a general index and
        numerous engravings volume xiii adam and charles black edinburgh mdcccxlii')"

    Therefore, we first need to recreate a list per entry by splitting each string:

       [u"'Encyclopaedia Britannica", u" 'Seventh edition, Volume 13, LAB-Magnetism", u" '1842", u" 'Edinburgh",
       u" '/mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/193108323", u" 'alto/193201394.34.xml",
       u" 'Page9", u" '810", u" 'book", u" 'nls", u" 'PreprocessWordType.NORMALIZE", u" u'the encyclopaedia britannica dictionary of
       arts sciences and general literature seventh edition i with preliminary dissertations on the history of the sciences and other extensive improvements
       and additions including the late supplement a general index and numerous engravings volume xiii adam and charles black edinburgh mdcccxlii'"]

    Later, for this query, we get the year (position 2, converted to an integer),
    the preprocess type (position 10) and the page (position 11).



    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords/keysentences and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>:
          [
            [<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>],
            ...
          ],
          <YEAR>:
          ...
        }

    :param df: unused; the data is read from HDFS via the Spark context
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """

    # Reading data from HDFS. Note that hdfs_data (the path to the HDFS data)
    # is assumed to be defined elsewhere, e.g. at module level.
    pages_hdfs = context.textFile(hdfs_data)

    # Ignoring the first character '(' and last character ')' of each entry, and splitting by "',"
    pages = pages_hdfs.map(lambda p_string: p_string[1:-1].split("\',"))

    # Cleaning the first ' of each element.
    pages_clean = pages.map(
        lambda p_entry: [item.split("\'")[1] for item in p_entry])

    # Getting the preprocess type from the first entry - position 10.
    f_entry = pages_clean.take(1)
    preprocess_type = f_entry[0][10]

    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    filter_pages = pages_clean.filter(lambda year_page: any(
        keysentence in year_page[11] for keysentence in keysentences))

    # [(year, [keysentence, keysentence]), ...]
    # We also need to convert the year string into an integer.
    matching_pages = filter_pages.map(lambda year_page: (int(year_page[
        2]), get_sentences_list_matches(year_page[11], keysentences)))

    # [[(year, keysentence), 1) ((year, keysentence), 1) ] ...]
    matching_sentences = matching_pages.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [keysentence, num_keysentences]), ...]
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount:
             (year_sentencecount[0], list(year_sentencecount[1]))) \
        .collect()
    return result
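
# Worked example (illustrative) of the parsing steps above on a shortened,
# hypothetical entry string:
#
#   entry = "('Title A', '1842', 'nls', 'some page text')"
#   parts = entry[1:-1].split("',")
#   # ["'Title A", " '1842", " 'nls", " 'some page text'"]
#   clean = [item.split("'")[1] for item in parts]
#   # ['Title A', '1842', 'nls', 'some page text']
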
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects articles that contain any of the configured target words and, for
    each matching keyword found in those articles, returns the article
    metadata grouped by year.
    """
    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    unproc_targetwords = config['targetwords']
    targetwords = []
    for t in unproc_targetwords:
        targetwords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in t.split()
        ]))
    print(f'targetwords: {targetwords}')
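
    # The pipeline below keeps only articles whose preprocessed text contains at
    # least one target word and then records, for each such article, every
    # keyword that find_matches reports.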

    # [(year, issue, article, clean_article_string), ...]
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, issue, article, preprocessed_article_string), ...]
    preprocessed_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, issue, article, preprocessed_article_string), ...] filtered to articles containing a target word
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(t in year_article[3] for t in targetwords))

    # [(year, issue, article, matching_keyword), ...]
    matching_articles = filter_articles.flatMap(
        lambda year_article: [(year_article[0], year_article[1], year_article[
            2], k) for k in find_matches(year_article[3], keywords)])

    matching_data = matching_articles.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id": sentence_data[2].article_id,
            "page_ids": list(sentence_data[2].page_ids),
            "section": sentence_data[2].ct,
            "keyword": sentence_data[3],
            "targets": list(targetwords),
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Selects the text of articles along with their metadata, using a list of
    keywords or keysentences, and groups the results by year. In the result file
    we store the preprocessed text instead of the original text.

    config_file must be the path to a lexicon file with a list of the keywords
    to search for, one per line.

    The config_file can also indicate the preprocess treatment, along with the
    defoe path and the type of operating system. We can also configure how many
    target words we want to use, and at which position the lexicon words start.

    To indicate the number of target words to take from the lexicon file, set
    num_target in the configuration file; num_target: 1 means that only the
    first word/sentence of the lexicon is used as the target.

    If we want to include the target words in the lexicon as well, set
    lexicon_start: 0 in the configuration file.

    If we do not want to include the target words in the lexicon (let's imagine
    that we have just one target word), set lexicon_start: 1.

    Returns result of form:

        {
          <YEAR>:
          [
            [- article_id:
             - authors:
             - filename:
             - issue_id:
             - page_ids:
             - preprocessed_text:
             - term:
             - title]
            ...
          ],
          <YEAR>:
          ...
        }
        


    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: articles' metadata and preprocessed text grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    num_target = int(config["num_target"])
    lexicon_start = int(config["lexicon_start"])
    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # Split the lexicon into target sentences (the first num_target entries) and
    # key sentences (from lexicon_start onwards).
    target_sentences = keysentences[0:num_target]
    keysentences = keysentences[lexicon_start:]
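    # For example (hypothetical lexicon), with num_target = 1 and lexicon_start = 1,
    # a lexicon of ["machine", "steam engine", "railway"] yields
    # target_sentences = ["machine"] and keysentences = ["steam engine", "railway"].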
    clean_articles = issues.flatMap(
        lambda issue: [(issue.date.year, issue, article,
                        clean_article_as_string(article, defoe_path, os_type))
                       for article in issue.articles])

    # [(year, issue, article, preprocessed_article_string), ...]
    t_articles = clean_articles.flatMap(
        lambda cl_article: [(cl_article[0], cl_article[1], cl_article[
            2], preprocess_clean_article(cl_article[3], preprocess_type))])

    # [(year, issue, article, preprocessed_article_string), ...] filtered to articles containing a target sentence
    target_articles = t_articles.filter(lambda year_article: any(
        target_s in year_article[3] for target_s in target_sentences))

    # [(year, issue, article, preprocessed_article_string), ...] further filtered to articles containing a key sentence
    filter_articles = target_articles.filter(lambda year_article: any(
        keysentence in year_article[3] for keysentence in keysentences))

    # [(year, issue, article, preprocessed_text, [keysentence, keysentence, ...]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count

    matching_articles = filter_articles.map(lambda year_article: (
        year_article[0], year_article[1], year_article[2], year_article[3],
        get_articles_list_matches(year_article[3], keysentences)))

    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1], year_sentence[2], year_sentence[3], sentence)\
                                for sentence in year_sentence[4]])

    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[2].title_string,
            "article_id:": sentence_data[2].article_id,
            "authors:": sentence_data[2].authors_string,
            "page_ids": list(sentence_data[2].page_ids),
            "term": sentence_data[4],
            "preprocessed text": sentence_data[3],
            "issue_id": sentence_data[1].newspaper_id,
            "filename": sentence_data[1].filename
        }))

    # [(date, {"title": title, ...}), ...]
    # =>

    result = matching_data \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result
Exemple #27
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords and groups by word.

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <WORD>:
          [
            { "article_id": <ARTICLE ID>,
              "issue_filename": <ISSUE.ZIP>, 
              "coord": <COORDENATES>, 
              "page_area": <PAGE AREA>,
              "page_filename": < PAGE FILENAME>,
              "place": <PLACE>,
              "textblock_id": <TEXTBLOCK ID>,
              "title": <TITLER>,
              "words": <WORDS>
              "year": <YEAR>,
            },
            ...
          ],
          <WORD>:
          ...
        }

    :param archives: RDD of defoe.fmp.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keywords = []
    with open(data_file, 'r') as f:
        keywords = [
            query_utils.preprocess_word(word, preprocess_type)
            for word in list(f)
        ]

    # [document, ...]
    documents = archives.flatMap(
        lambda archive: [document for document in list(archive)])

    filtered_words = documents.flatMap(lambda document: get_article_matches(
        document, keywords, preprocess_type))
    #[(year, document, article, textblock_id, textblock_coords, textblock_page_area, words, page_name, keyword), ....]
    # =>
    # [(word, {"article_id": article_id, ...}), ...]
    matching_docs = filtered_words.map(lambda document_article_word: (
        document_article_word[8], {
            "title": document_article_word[1].title,
            "place": document_article_word[1].place,
            "article_id": document_article_word[2],
            "textblock_id": document_article_word[3],
            "coord": document_article_word[4],
            "page_area": document_article_word[5],
            "year": document_article_word[0],
            "words": document_article_word[6],
            "page_filename": document_article_word[7],
            "issue_filename": document_article_word[1].archive.filename
        }))

    # [(word, {"article_id": article_id, ...}), ...]
    # =>
    # [(word, [{"article_id": article_id, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda word_context:
             (word_context[0], list(word_context[1]))) \
        .collect()
    return result
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words, for keywords and groups by date.

    Data in ES have the following columns:

    "title",  "edition", "year", "place", "archive_filename", 
    "source_text_filename", "text_unit", "text_unit_id", 
    "num_text_unit", "type_archive", "model", "source_text_raw", 
    "source_text_clean", "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must be the path to a configuration file with a list
    of the keywords to search for, one per line.

    Both keywords and words in documents are normalized, by removing
    all non-'a-z|A-Z' characters.

    Returns result of form:
          [(year, [(title, edition, archive_filename, filename, word, concordance),
              (title, edition, archive_filename, filename, word, concordance), ...]), ...]


    :param archives: RDD of defoe archive objects
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    window = 20
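    # 'window' is passed to get_concordance_string below; it appears to control how
    # many words of context are kept around each keyword occurrence (an assumption
    # based on the call site rather than documented behavior).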
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    preprocess_config = config["preprocess"]
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))

    keysentences = []
    with open(data_file, 'r') as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [
                query_utils.preprocess_word(word, preprocess_type)
                for word in k_split
            ]
            sentence_norm = ''
            for word in sentence_word:
                if sentence_norm == '':
                    sentence_norm = word
                else:
                    sentence_norm += " " + word
            keysentences.append(sentence_norm)

    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(lambda archive: [(
        document.year, document, document.title, document.edition, document.
        archive.filename) for document in list(archive)])

    # (year, title, edition, archive_filename, page_code, clean_page_string)
    clean_pages = documents.flatMap(lambda year_document: [(year_document[
        0], year_document[2], year_document[3], year_document[
            4], page.code, clean_page_as_string(
                page, defoe_path, os_type)) for page in year_document[1]])

    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    pages = clean_pages.flatMap(lambda cl_page: [(
        cl_page[0], cl_page[1], cl_page[2], cl_page[3], cl_page[4],
        preprocess_clean_page(cl_page[5], preprocess_type))])
    #(year, title, edition, archive_filename, page_code, preprocess_clean_page)
    filter_pages = pages.filter(lambda year_page: any(
        keysentence in year_page[5] for keysentence in keysentences))

    # [(year, title, edition, archive_filename, filename, text, [(word, idx), (word, idx) ...]), ...]
    matching_idx = filter_pages.map(lambda year_page: (
        (year_page[0], year_page[1], year_page[2], year_page[3], year_page[4],
         year_page[5], get_text_keyword_idx(year_page[5], keysentences))))

    # [(year, [(title, edition, archive_filename, filename, word, [concordance, ...]), ...])]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [
            (year_idx[0],
                {"title": year_idx[1], "edition": year_idx[2], "archive_filename": year_idx[3],
                 "filename": year_idx[4], "term": word_idx[0],
                 "snippet": get_concordance_string(year_idx[5], word_idx[0], word_idx[1], window)})
                for word_idx in year_idx[6]])
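    # e.g. a single (hypothetical) element of concordance_words:
    #   (1805, {"title": "...", "edition": "...", "archive_filename": "...",
    #           "filename": "...", "term": "engine",
    #           "snippet": "... the new steam engine was installed at the mill ..."})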

    result = concordance_words.groupByKey() \
        .map(lambda year_match:
             (year_match[0], list(year_match[1]))) \
        .collect()
    return result
Exemple #29
def do_query(issues, config_file=None, logger=None, context=None):
    print('Loading config')
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    print(f'config: {config}')

    if sys.platform == "linux":
        os_type = "sys-i386-64"
    else:
        os_type = "sys-i386-snow-leopard"
    print(f'platform: {sys.platform}')

    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    preprocess_type = query_utils.extract_preprocess_word_type(config)
    print(f'preprocessing: {preprocess_type}')

    unproc_keywords = config['keywords']
    keywords = []
    for k in unproc_keywords:
        keywords.append(' '.join([
            query_utils.preprocess_word(word, preprocess_type)
            for word in k.split()
        ]))
    print(f'keywords: {keywords}')

    clean_articles = issues.flatMap(lambda issue: [(
        issue.date.year, clean_article_as_string(
            article, defoe_path, os_type)) for article in issue.articles])

    preprocessed_articles = clean_articles.map(lambda cl_article: (cl_article[
        0], preprocess_clean_article(cl_article[1], preprocess_type)))

    # [(year, preprocessed_article_string), ...]
    filter_articles = preprocessed_articles.filter(
        lambda year_article: any(k in year_article[1] for k in keywords))

    # [(year, [keysentence, keysentence]), ...]
    # Note: get_articles_list_matches ---> articles count
    # Note: get_sentences_list_matches ---> word_count
    matching_articles = filter_articles.map(lambda year_article: (year_article[
        0], get_sentences_list_matches(year_article[1], keywords)))

    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_articles.flatMap(lambda year_sentence: [(
        (year_sentence[0], sentence), 1) for sentence in year_sentence[1]])

    # [((year, keysentence), num_keysentences), ...]
    # =>
    # [(year, (keysentence, num_keysentences)), ...]
    # =>
    # [(year, [{keysentence: num_keysentences}, ...]), ...]
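    # e.g. [(1900, [{"steam engine": 3}, {"railway": 12}]), ...] (hypothetical values)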
    result = matching_sentences\
        .reduceByKey(add)\
        .map(lambda yearsentence_count:
             (yearsentence_count[0][0],
              (yearsentence_count[0][1], yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda a: (a[0], [{x[0]: x[1]} for x in a[1]])) \
        .collect()
    return result
Exemple #30
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Crops articles' images for keywords and groups by word.

    config_file must be a YAML file that has the following values:
        * preprocess: Treatment to use for preprocessing the words. Options: [normalize|stem|lemmatize|none]
        * data: TXT file with a list of the keywords to search for, one per line.
                This file should be in the same directory as the configuration file.

        Important: The first two words/sentences in this list are treated as target words.

        * years_filter: Min and max years to filter the data, separated by "-".
        * output_path: The path in which to store the cropped images.

    Returns result of form:

        {
          <WORD>:
          [
            { "article_id": <ARTICLE ID>,
              "issue_filename": <ISSUE.ZIP>, 
              "issue_id": <ISSUE ID>
              "coord": <COORDENATES>,
              "cropped_image": <IMAGE.JPG> 
              "page_area": <PAGE AREA>,
              "page_filename": < PAGE FILENAME>,
              "place": <PLACE>,
              "textblock_id": <TEXTBLOCK ID>,
              "title": <TITLER>,
              "words": <WORDS>,
              "preprocessed_words": <PREPROCESSED WORDS> 
              "year": <YEAR>,
              "date": <DATE>
            },
            ...
          ],
          <WORD>:
          ...
        }
    :param archives: RDD of defoe.fmp.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
    by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    year_min, year_max = query_utils.extract_years_filter(config)
    output_path = query_utils.extract_output_path(config)
    keywords = []
    with open(data_file, 'r') as f:
        keywords = [query_utils.preprocess_word(word, preprocess_type)
                    for word in list(f)]

    
    # We can change the following line if we want to include more or fewer words as target_words.
    # In this case, the first two words of the lexicon are selected as target_words.
    target_words = keywords[0:2]
    # The rest of the words in the lexicon are selected as keywords.
    keywords = keywords[2:]
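    # For instance (hypothetical lexicon), ["machine", "steam engine", "factory", "mill"]
    # gives target_words = ["machine", "steam engine"] and keywords = ["factory", "mill"].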
    # [document, ...]

    # We select/filter the textblocks that follow this rule: the text contains at least
    # one target word AND at least one keyword.
    

    documents = archives.flatMap(
        lambda archive: [document for document in list(archive) if document.year >= int(year_min) and document.year <= int(year_max) ])

    filtered_tb = documents.flatMap(
        lambda document: get_article_matches(document, target_words, preprocess_type))
    
    filtered_words = filtered_tb.flatMap(lambda tb: get_tb_matches(tb, keywords))

    #[(year, document, article, textblock_id, textblock_coords, textblock_page_area, words, preprocessed_words, page_name, keyword,target), ....]
    # [(word, {"article_id": article_id, ...}), ...]
    matching_docs = filtered_words.map(
        lambda document_article_word:
        (document_article_word[9],
         {"title": document_article_word[1].title,
          "place": document_article_word[1].place,
          "article_id": document_article_word[2],
          "textblock_id": document_article_word[3], 
          "coord": document_article_word[4],
          "page_area": document_article_word[5],
          "year": document_article_word[0],
          "words":  document_article_word[6],
          "date":  document_article_word[1].date,
          "preprocessed_words":  document_article_word[7],
          "page_filename":  document_article_word[8],
          "issue_id": document_article_word[1].documentId,
          "issue_dirname": document_article_word[1].archive.filename,
          "target_word": document_article_word[10],
          "cropped_image": segment_image(document_article_word[4], document_article_word[8], document_article_word[1].archive.filename, document_article_word[9], output_path, document_article_word[10])
         }))


    # [(word, {"article_id": article_id, ...}), ...]
    # =>
    # [(word, [{"article_id": article_id, ...], {...}), ...)]
    result = matching_docs \
        .groupByKey() \
        .map(lambda word_context:
             (word_context[0], list(word_context[1]))) \
        .collect()
    return result