def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingests NLS pages, applies all four preprocess treatments
    (none, normalize, lemmatize, stem) to each page, and saves the
    results to a PostgreSQL table, together with the metadata
    associated with each page.

    Metadata collected: title, edition, year, place, archive filename,
    page filename, page id, num pages, type of archive, model,
    source_text_raw, source_text_clean, source_text_norm,
    source_text_lemmatize, source_text_stem, num_words.

    Data is saved as a DataFrame into a PostgreSQL table.

    Example of a saved row:

        ('Encyclopaedia Britannica,
         "Seventh edition, Volume 13, LAB-Magnetism",
         1842,
         Edinburgh,
         /mnt/lustre/at003/at003/rfilguei2/nls-data-encyclopaediaBritannica/193108323,
         alto/193201394.34.xml,
         page,
         Page9,
         810,
         book,
         nls,
         "THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.",
         "THE ENCYCLOPAEDIA BRITANNICA DICTIONARY OF ARTS, SCIENCES, AND GENERAL LITERATURE. SEVENTH EDITION, i WITH PRELIMINARY DISSERTATIONS ON THE HISTORY OF THE SCIENCES, AND OTHER EXTENSIVE IMPROVEMENTS AND ADDITIONS; INCLUDING THE LATE SUPPLEMENT. A GENERAL INDEX, AND NUMEROUS ENGRAVINGS. VOLUME XIII. ADAM AND CHARLES BLACK, EDINBURGH; M.DCCC.XLII.",
         the encyclopaedia britannica dictionary of arts sciences and general literature seventh edition i with preliminary dissertations on the history of the sciences and other extensive improvements and additions including the late supplement a general index and numerous engravings volume xiii adam and charles black edinburgh mdcccxlii,
         the encyclopaedia britannica dictionary of art science and general literature seventh edition i with preliminary dissertation on the history of the science and other extensive improvement and addition including the late supplement a general index and numerous engraving volume xiii adam and charles black edinburgh mdcccxlii,
         the encyclopaedia britannica dictionari of art scienc and gener literatur seventh edit i with preliminari dissert on the histori of the scienc and other extens improv and addit includ the late supplement a gener index and numer engrav volum xiii adam and charl black edinburgh mdcccxlii,
         46')

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    preprocess_none = query_utils.parse_preprocess_word_type("none")
    preprocess_normalize = query_utils.parse_preprocess_word_type("normalize")
    preprocess_lemmatize = query_utils.parse_preprocess_word_type("lemmatize")
    preprocess_stem = query_utils.parse_preprocess_word_type("stem")
    text_unit = "page"
    # [(title, edition, year, place, archive filename, num pages,
    #   type of archive, model, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.title, document.edition, document.year,
                          document.place, document.archive.filename,
                          document.num_pages, document.document_type,
                          document.model, document)
                         for document in list(archive)])
    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_text_unit, type of archive, model,
    #   raw_page, clean_page, num_words), ...]
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1],
                                year_document[2], year_document[3],
                                year_document[4], page.code, text_unit,
                                page.page_id, year_document[5],
                                year_document[6], year_document[7],
                                get_page_as_string(page, preprocess_none),
                                clean_page_as_string(page),
                                len(page.words))
                               for page in year_document[8]])
    # [(title, edition, year, place, archive filename, page filename,
    #   text_unit, text_unit_id, num_text_unit, type of archive, model,
    #   raw_page, clean_page, clean_norm_page, clean_lemma_page,
    #   clean_stem_page, num_words), ...]
    pages = pages_clean.flatMap(
        lambda clean_page: [(clean_page[0], clean_page[1], clean_page[2],
                             clean_page[3], clean_page[4], clean_page[5],
                             clean_page[6], clean_page[7], clean_page[8],
                             clean_page[9], clean_page[10], clean_page[11],
                             clean_page[12],
                             preprocess_clean_page(clean_page[12],
                                                   preprocess_normalize),
                             preprocess_clean_page(clean_page[12],
                                                   preprocess_lemmatize),
                             preprocess_clean_page(clean_page[12],
                                                   preprocess_stem),
                             clean_page[13])])
    nlsRow = Row("title", "edition", "year", "place", "archive_filename",
                 "source_text_filename", "text_unit", "text_unit_id",
                 "num_text_unit", "type_archive", "model", "source_text_raw",
                 "source_text_clean", "source_text_norm",
                 "source_text_lemmatize", "source_text_stem", "num_words")
    sqlContext = SQLContext(context)
    df = sqlContext.createDataFrame(pages, nlsRow)
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    url = "jdbc:postgresql://%s:%s/%s" % (
        config["host"], config["port"], config["database"])
    properties = {"user": config["user"], "driver": config["driver"]}
    mode = "overwrite"
    df.write.jdbc(url=url, table=config["table"], mode=mode,
                  properties=properties)
    return "0"
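# A minimal sketch of the database section of the YAML configuration this
# query reads. The keys are exactly those accessed above (host, port,
# database, user, driver, table); the values below are hypothetical
# placeholders, not a real deployment:
#
#   host: localhost
#   port: 5432
#   database: defoe_db
#   user: defoe_user
#   driver: org.postgresql.Driver
#   table: nls_pages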
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words for keywords, and groups the
    results by date.

    Data in ES has the following columns:

    "title", "edition", "year", "place", "archive_filename",
    "source_text_filename", "text_unit", "text_unit_id", "num_text_unit",
    "type_archive", "model", "source_text_raw", "source_text_clean",
    "source_text_norm", "source_text_lemmatize", "source_text_stem",
    "num_words"

    config_file must be the path to a configuration file with a list of
    the keywords to search for, one per line. Both keywords and words in
    documents are normalized, by removing all non-'a-z|A-Z' characters.

    Returns result of form:

        [(year, [(title, edition, archive_filename, filename, word,
                  concordance),
                 (title, edition, archive_filename, filename, word,
                  concordance),
                 ...]),
         ...]

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
        by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    window = 20
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document, document.title,
                          document.edition, document.archive.filename)
                         for document in list(archive)])
    # [(year, title, edition, archive_filename, page_code,
    #   clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[2],
                                year_document[3], year_document[4],
                                page.code,
                                clean_page_as_string(page, defoe_path,
                                                     os_type))
                               for page in year_document[1]])
    # [(year, title, edition, archive_filename, page_code,
    #   preprocessed_clean_page), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2], cl_page[3],
                          cl_page[4],
                          preprocess_clean_page(cl_page[5],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[5]
                              for keysentence in keysentences))
    # [(year, title, edition, archive_filename, filename, text,
    #   [(word, idx), (word, idx), ...]), ...]
    matching_idx = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[2],
                           year_page[3], year_page[4], year_page[5],
                           get_text_keyword_idx(year_page[5],
                                                keysentences)))
    # [(year, {title, edition, archive_filename, filename, term,
    #   snippet}), ...]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [(year_idx[0],
                           {"title": year_idx[1],
                            "edition": year_idx[2],
                            "archive_filename": year_idx[3],
                            "filename": year_idx[4],
                            "term": word_idx[0],
                            "snippet": get_concordance_string(
                                year_idx[5], word_idx[0], word_idx[1],
                                window)})
                          for word_idx in year_idx[6]])
    result = concordance_words.groupByKey() \
        .map(lambda year_match: (year_match[0], list(year_match[1]))) \
        .collect()
    return result
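# Illustrative sketch only: get_concordance_string is provided by defoe's
# query utilities, and its exact behaviour is assumed here. This local
# helper shows the intended shape of "snippet" above: up to `window` words
# either side of the matched term.
def _example_concordance(text, idx, window):
    words = text.split()
    start = max(0, idx - window)
    return " ".join(words[start:idx + window + 1])

# _example_concordance("a b kail c d", 2, 1) -> "b kail c"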
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and
    groups the results by word.

    The config_file must indicate the path to a lexicon file with a list
    of the keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system.

    Returns result of form:

        {
          <WORD>: [[<YEAR>, <NUM_WORDS>], ...],
          <WORD>: ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by word
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document)
                         for document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],
                                clean_page_as_string(page, defoe_path,
                                                     os_type))
                               for page in year_document[1]])
    # [(year, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[1]
                              for keysentence in keysentences))
    # [(year, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))
    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])
    # [((year, keysentence), num_keysentences), ...]
    # => [(keysentence, (year, num_keysentences)), ...]
    # => [(keysentence, [(year, num_keysentences), ...]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count: (yearsentence_count[0][1],
                                         (yearsentence_count[0][0],
                                          yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount: (year_sentencecount[0],
                                         list(year_sentencecount[1]))) \
        .collect()
    return result
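# A local, non-Spark sketch of the aggregation above, showing how the
# ((year, keysentence), 1) pairs end up as {keysentence: [(year, count)]}.
# The sample data is made up.
from collections import Counter, defaultdict

_pairs = [((1795, "kail"), 1), ((1795, "kail"), 1), ((1796, "aff"), 1)]
_counts = Counter()
for _key, _n in _pairs:                      # reduceByKey(add)
    _counts[_key] += _n
_grouped = defaultdict(list)
for (_year, _word), _n in _counts.items():   # map + groupByKey
    _grouped[_word].append((_year, _n))
# _grouped == {"kail": [(1795, 2)], "aff": [(1796, 1)]}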
def do_query(df, config_file=None, logger=None, context=None):
    """
    Gets concordance for keywords in NLS articles and groups the results
    by date.

    Data in HDFS has the following columns:

    "title", "edition", "year", "place", "archive_filename",
    "source_text_filename", "text_unit", "text_unit_id", "num_text_unit",
    "type_archive", "model", "type_page", "header", "term", "definition",
    "num_articles", "num_page_words", "num_article_words"

    config_file must be the path to a lexicon file with a list of the
    keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {"title": ..., "edition": ..., "archive_filename": ...,
             "filename": ..., "page number": ..., "type_page": ...,
             "header": ..., "term": ..., "article": ...,
             "article-definition": ...},
            ...
          ],
          <YEAR>: ...
        }

    :param df: DataFrame of NLS articles data
    :type df: pyspark.sql.DataFrame
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
        by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Treat blank definitions as null, keep only "nlsArticles" rows and
    # select the columns needed for the query.
    fdf = df.withColumn("definition", blank_as_null("definition"))
    # (year, title, edition, archive_filename, page_filename, page_number,
    #  type of page, header, term, article_text)
    newdf = fdf.filter(fdf.definition.isNotNull()).filter(
        fdf["model"] == "nlsArticles").select(
            fdf.year, fdf.title, fdf.edition, fdf.archive_filename,
            fdf.source_text_filename, fdf.text_unit_id, fdf.type_page,
            fdf.header, fdf.term, fdf.definition)
    articles = newdf.rdd.map(tuple)
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # (year, title, edition, archive_filename, page_filename, page_number,
    #  type of page, header, term, preprocessed_article, article_text)
    preprocess_articles = articles.flatMap(
        lambda t_articles: [(t_articles[0], t_articles[1], t_articles[2],
                             t_articles[3], t_articles[4], t_articles[5],
                             t_articles[6], t_articles[7], t_articles[8],
                             preprocess_clean_page(t_articles[9],
                                                   preprocess_type),
                             t_articles[9])])
    # Keep only articles whose preprocessed text contains a keysentence.
    filter_articles = preprocess_articles.filter(
        lambda year_page: any(keysentence in year_page[9]
                              for keysentence in keysentences))
    # (year, title, edition, archive_filename, page_filename, page_number,
    #  type of page, header, term, article_text, list_sentences)
    matching_articles = filter_articles.map(
        lambda year_article: (year_article[0], year_article[1],
                              year_article[2], year_article[3],
                              year_article[4], year_article[5],
                              year_article[6], year_article[7],
                              year_article[8], year_article[10],
                              get_articles_list_matches(year_article[9],
                                                        keysentences)))
    # (year, title, edition, archive_filename, page_filename, page_number,
    #  type of page, header, term, article_text, sentence)
    matching_sentences = matching_articles.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1],
                                year_sentence[2], year_sentence[3],
                                year_sentence[4], year_sentence[5],
                                year_sentence[6], year_sentence[7],
                                year_sentence[8], year_sentence[9],
                                sentence)
                               for sentence in year_sentence[10]])
    matching_data = matching_sentences.map(
        lambda sentence_data: (sentence_data[0], {
            "title": sentence_data[1],
            "edition": sentence_data[2],
            "archive_filename": sentence_data[3],
            "filename": sentence_data[4],
            "page number": sentence_data[5],
            "type_page": sentence_data[6],
            "header": sentence_data[7],
            "term": sentence_data[10],
            "article": sentence_data[8],
            "article-definition": sentence_data[9]}))
    # [(date, {"title": title, ...}), ...]
    result = matching_data \
        .groupByKey() \
        .map(lambda date_context: (date_context[0],
                                   list(date_context[1]))) \
        .collect()
    return result
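# blank_as_null is imported from defoe's SQL helpers; this is a plausible
# minimal implementation (an assumption, not the actual defoe code). It maps
# empty-string definitions to null so that the isNotNull() filter above also
# drops blank articles.
from pyspark.sql.functions import col, when

def _example_blank_as_null(x):
    """Return column `x` with empty strings replaced by null."""
    return when(col(x) != "", col(x)).otherwise(None)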
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts as a "hit" every page that contains a particular term from a
    lexicon, and groups the results by year.

    The config_file must indicate the path to a lexicon file with a list
    of the keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system. If a term appears several times on the same page,
    it is still counted as 1.

    Example:

        1795:
        - - kail
          - 1
        - - aff
          - 4
        - - lairds
          - 1

    This means that "kail" appears in 1 page, "aff" in 4 pages, and
    "lairds" in 1 page across all the books from the year 1795.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document)
                         for document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],
                                clean_page_as_string(page, defoe_path,
                                                     os_type))
                               for page in year_document[1]])
    # [(year, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[1]
                              for keysentence in keysentences))
    # [(year, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))
    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])
    # [((year, keysentence), num_keysentences), ...]
    # => [(year, (keysentence, num_keysentences)), ...]
    # => [(year, [(keysentence, num_keysentences), ...]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count: (yearsentence_count[0][0],
                                         (yearsentence_count[0][1],
                                          yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount: (year_sentencecount[0],
                                         list(year_sentencecount[1]))) \
        .collect()
    return result
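# Hypothetical example of the two input files this query expects. The
# configuration keys follow the reads above and defoe's query_utils helpers
# (the exact key name for the lexicon path is an assumption):
#
#   preprocess: normalize
#   data: sc_words.txt
#   os_type: linux
#   defoe_path: /home/user/defoe/
#
# The lexicon file (sc_words.txt here) lists one keyword or keysentence
# per line, e.g.:
#
#   kail
#   aff
#   lairds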
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts the number of occurrences of keywords or keysentences and
    groups the results by year.

    config_file must be the path to a configuration file with a list of
    the keywords to search for, one per line. Both keywords/keysentences
    and words in documents are normalized, by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <YEAR>: [[<SENTENCE|WORD>, <NUM_SENTENCES|WORDS>], ...],
          <YEAR>: ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document)
                         for document in list(archive)])
    # [(year, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0],
                                clean_page_as_string(page))
                               for page in year_document[1]])
    # [(year, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[1]
                              for keysentence in keysentences))
    # [(year, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0],
                           get_sentences_list_matches(year_page[1],
                                                      keysentences)))
    # [((year, keysentence), 1), ((year, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [((year_sentence[0], sentence), 1)
                               for sentence in year_sentence[1]])
    # [((year, keysentence), num_keysentences), ...]
    # => [(year, (keysentence, num_keysentences)), ...]
    # => [(year, [(keysentence, num_keysentences), ...]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda yearsentence_count: (yearsentence_count[0][0],
                                         (yearsentence_count[0][1],
                                          yearsentence_count[1]))) \
        .groupByKey() \
        .map(lambda year_sentencecount: (year_sentencecount[0],
                                         list(year_sentencecount[1]))) \
        .collect()
    return result
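# Sketch (an assumption; the real code lives in defoe.query_utils) of what
# the normalization described in the docstring does to one word: drop every
# non-'a-z|A-Z' character and lowercase the rest.
import re

def _example_normalize(word):
    return re.sub("[^A-Za-z]", "", word).lower()

# _example_normalize("Encyclopaedia,") -> "encyclopaedia"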
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Counts as a "hit" every occurrence of a term from our lexicon and
    groups the results by book.

    config_file must be the path to a lexicon file with a list of the
    keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system.

    Example:

        "'Twas on the morn of sweet May Day":
        - - neu
          - 1
        - - blaw
          - 5

    This means that "neu" appears once in the book "'Twas on the morn of
    sweet May Day", and "blaw" appears 5 times in the same book.

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: number of occurrences of keywords grouped by title
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(title, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.title, document)
                         for document in list(archive)])
    # [(title, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda title_document: [(title_document[0],
                                 clean_page_as_string(page, defoe_path,
                                                      os_type))
                                for page in title_document[1]])
    # [(title, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0],
                          preprocess_clean_page(cl_page[1],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda title_page: any(keysentence in title_page[1]
                               for keysentence in keysentences))
    # [(title, [keysentence, keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda title_page: (title_page[0],
                            get_sentences_list_matches_per_page(
                                title_page[1], keysentences)))
    # [((title, keysentence), 1), ((title, keysentence), 1), ...]
    matching_sentences = matching_pages.flatMap(
        lambda title_sentence: [((title_sentence[0], sentence), 1)
                                for sentence in title_sentence[1]])
    # [((title, keysentence), num_keysentences), ...]
    # => [(title, (keysentence, num_keysentences)), ...]
    # => [(title, [(keysentence, num_keysentences), ...]), ...]
    result = matching_sentences \
        .reduceByKey(add) \
        .map(lambda titlesentence_count: (titlesentence_count[0][0],
                                          (titlesentence_count[0][1],
                                           titlesentence_count[1]))) \
        .groupByKey() \
        .map(lambda title_sentencecount: (title_sentencecount[0],
                                          list(title_sentencecount[1]))) \
        .collect()
    return result
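# Sketch of the assumed difference between get_sentences_list_matches_per_page
# (used here) and the variant used in the year-grouped queries: one entry per
# occurrence, so the summed 1s become term frequencies per title rather than
# page hits. Single-word keywords only, for illustration.
def _example_matches_per_occurrence(text, keywords):
    return [word for word in text.split() if word in keywords]

# _example_matches_per_occurrence("blaw neu blaw", {"blaw", "neu"})
# -> ["blaw", "neu", "blaw"]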
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets concordance using a window of words (here configured to 40) for
    keywords, and groups the results by date. Stores the snippet
    (40 words before and after each term).

    config_file must be the path to a lexicon file with a list of the
    keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {"archive_filename": ..., "edition": ..., "filename": ...,
             "snippet": ..., "term": ..., "title": ...},
            ...
          ],
          <YEAR>: ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: information on documents in which keywords occur grouped
        by date
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    window = 40
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document, title, edition, archive_filename), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document, document.title,
                          document.edition, document.archive.filename)
                         for document in list(archive)])
    # [(year, title, edition, archive_filename, page_code,
    #   clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[2],
                                year_document[3], year_document[4],
                                page.code,
                                clean_page_as_string(page, defoe_path,
                                                     os_type))
                               for page in year_document[1]])
    # [(year, title, edition, archive_filename, page_code,
    #   preprocessed_clean_page), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2], cl_page[3],
                          cl_page[4],
                          preprocess_clean_page(cl_page[5],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[5]
                              for keysentence in keysentences))
    # [(year, title, edition, archive_filename, filename, text,
    #   [(word, idx), (word, idx), ...]), ...]
    matching_idx = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[2],
                           year_page[3], year_page[4], year_page[5],
                           get_text_keysentence_idx(year_page[5],
                                                    keysentences)))
    # [(year, {title, edition, archive_filename, filename, term,
    #   snippet}), ...]
    concordance_words = matching_idx.flatMap(
        lambda year_idx: [(year_idx[0],
                           {"title": year_idx[1],
                            "edition": year_idx[2],
                            "archive_filename": year_idx[3],
                            "filename": year_idx[4],
                            "term": word_idx[0],
                            "snippet": get_concordance_string(
                                year_idx[5], word_idx[0], word_idx[1],
                                window)})
                          for word_idx in year_idx[6]])
    result = concordance_words.groupByKey() \
        .map(lambda year_match: (year_match[0], list(year_match[1]))) \
        .collect()
    return result
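# A local sketch of the final groupByKey step: concordance records keyed by
# year are gathered into per-year lists (sample data is made up).
from itertools import groupby

_records = [(1842, {"term": "magnetism", "snippet": "..."}),
            (1842, {"term": "labour", "snippet": "..."})]
_by_year = {year: [r for _, r in group]
            for year, group in groupby(sorted(_records, key=lambda r: r[0]),
                                       key=lambda r: r[0])}
# _by_year == {1842: [{"term": "magnetism", ...}, {"term": "labour", ...}]}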
def do_query(archives, config_file=None, logger=None, context=None):
    """
    Gets the concordance (also called details) for occurrences of
    keywords or keysentences and groups the results by year.

    The config_file must indicate the path to a lexicon file with a list
    of the keywords to search for, one per line. The config_file can also
    indicate the preprocess treatment, the defoe path, and the type of
    operating system.

    Returns result of form:

        {
          <YEAR>:
          [
            {"title": ..., "place": ..., "publisher": ...,
             "page_number": ..., "snippet": ..., "term": ...,
             "document_id": ..., "filename": ...},
            ...
          ],
          <YEAR>: ...
        }

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: concordance details of keywords grouped by year
    :rtype: dict
    """
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type = "sys-i386-snow-leopard"
    else:
        os_type = "sys-i386-64"
    if "defoe_path" in config:
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    preprocess_type = query_utils.extract_preprocess_word_type(config)
    data_file = query_utils.extract_data_file(config,
                                              os.path.dirname(config_file))
    # Normalize each keyword/keysentence with the configured treatment.
    keysentences = []
    with open(data_file, "r") as f:
        for keysentence in list(f):
            k_split = keysentence.split()
            sentence_word = [query_utils.preprocess_word(word,
                                                         preprocess_type)
                             for word in k_split]
            keysentences.append(" ".join(sentence_word))
    # [(year, document), ...]
    documents = archives.flatMap(
        lambda archive: [(document.year, document)
                         for document in list(archive)])
    # [(year, document, page, clean_page_string), ...]
    clean_pages = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], page,
                                clean_page_as_string(page, defoe_path,
                                                     os_type))
                               for page in year_document[1]])
    # [(year, document, page, preprocessed_page_string), ...]
    pages = clean_pages.flatMap(
        lambda cl_page: [(cl_page[0], cl_page[1], cl_page[2],
                          preprocess_clean_page(cl_page[3],
                                                preprocess_type))])
    # Keep only pages whose preprocessed text contains a keysentence.
    filter_pages = pages.filter(
        lambda year_page: any(keysentence in year_page[3]
                              for keysentence in keysentences))
    # [(year, document, page, [keysentence, ...]), ...]
    matching_pages = filter_pages.map(
        lambda year_page: (year_page[0], year_page[1], year_page[2],
                           get_sentences_list_matches(year_page[3],
                                                      keysentences)))
    # [(year, document, page, keysentence), ...]
    matching_sentences = matching_pages.flatMap(
        lambda year_sentence: [(year_sentence[0], year_sentence[1],
                                year_sentence[2], sentence)
                               for sentence in year_sentence[3]])
    matching_data = matching_sentences.map(
        lambda page_data: (page_data[0], {
            "title": page_data[1].title,
            "place": page_data[1].place,
            "publisher": page_data[1].publisher,
            "page_number": page_data[2].code,
            "snippet": page_data[2].content,
            "term": page_data[3],
            "document_id": page_data[1].code,
            "filename": page_data[1].archive.filename}))
    result = matching_data \
        .groupByKey() \
        .map(lambda date_context: (date_context[0],
                                   list(date_context[1]))) \
        .collect()
    return result
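# The collected result is normally serialised by defoe's query runner
# (outside this module); a minimal sketch of doing the same by hand with
# PyYAML, where the output file name is a placeholder:
import yaml

def _example_write_results(result, out_file="results.yml"):
    with open(out_file, "w") as f:
        yaml.safe_dump(dict(result), f)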