def test_chunk_json_lines_with_remainder(fs):
    out_file_path = "wordforms"
    out_file = open(out_file_path, 'w')
    write_json_lines(out_file, iterate_wf(["wf1", "wf2", "wf3", "wf4", "wf5"]))
    out_file.close()

    out_file = open(out_file_path, 'r')
    res = list(chunk_json_lines(out_file, 2))
    out_file.close()

    outp = [
        [{"wordform": "wf1"}, {"wordform": "wf2"}],
        [{"wordform": "wf3"}, {"wordform": "wf4"}],
        [{"wordform": "wf5"}],
    ]

    assert outp == res

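# A minimal sketch of the chunking behaviour the test above relies on:
# `chunk_json_lines` is assumed to parse a JSON-lines file and yield lists of
# at most `batch_size` objects, with a final, smaller chunk for any remainder.
# The real implementation lives elsewhere in this codebase and may differ;
# `json` is assumed to be imported, as elsewhere in this module.
def _chunk_json_lines_sketch(file_handle, batch_size):
    chunk = []
    for line in file_handle:
        chunk.append(json.loads(line))
        if len(chunk) == batch_size:
            yield chunk
            chunk = []
    if chunk:
        # the remainder: fewer than batch_size objects
        yield chunk
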
def connect_anahashes_to_wordforms(session, anahashes, df, batch_size=50000):
    """Create the relation between wordforms and anahashes in the database.

    Given `anahashes`, a dataframe with wordforms and corresponding anahashes,
    create the relations between the two in the wordforms and anahashes tables
    by setting the anahash_id foreign key in the wordforms table.
    """
    LOGGER.info('Connecting anahashes to wordforms.')

    LOGGER.debug('Getting wordform/anahash_id pairs.')
    with get_temp_file() as anahash_to_wf_file:
        total_lines_written = write_json_lines(
            anahash_to_wf_file, get_anahashes(session, anahashes, df))

        update_statement = Wordform.__table__.update(). \
            where(Wordform.wordform_id == bindparam('wf_id')). \
            values(anahash_id=bindparam('a_id'))

        LOGGER.debug('Adding the connections wordform -> anahash_id.')
        sql_query_batches(session, update_statement,
                          read_json_lines(anahash_to_wf_file),
                          total_lines_written, batch_size)

    LOGGER.info('Added the anahash of %s wordforms.', total_lines_written)

    return total_lines_written

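# `get_anahashes` is assumed to yield dicts keyed to the bindparams of the
# update statement above, e.g. {'wf_id': 1, 'a_id': 17}. A batch executed by
# `sql_query_batches` would then amount to (illustrative values):
#
#     session.execute(update_statement, [{'wf_id': 1, 'a_id': 17},
#                                        {'wf_id': 2, 'a_id': 17}])
#
# SQLAlchemy binds each dict to the statement's parameters, so one UPDATE
# template covers the whole batch (an executemany under the hood).
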
def test_read_and_write_json_lines_empty(fs):
    objects = []
    fname = 'objects'

    f = open(fname, 'w')
    write_json_lines(f, objects)
    f.close()

    f = open(fname, 'r')
    assert os.path.exists(fname)
    assert os.path.getsize(fname) == 0
    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results

def test_chunk_json_lines_without_remainder(fs):
    out_file_path = "wordforms"
    out_file = open(out_file_path, 'w')
    write_json_lines(out_file, iterate_wf(["wf1", "wf2", "wf3", "wf4"]))
    out_file.close()

    out_file = open(out_file_path, 'r')
    res = list(chunk_json_lines(out_file, 2))
    out_file.close()

    outp = [
        [{'wordform': 'wf1'}, {'wordform': 'wf2'}],
        [{'wordform': 'wf3'}, {'wordform': 'wf4'}],
    ]

    assert outp == res

def add_morphological_paradigms(session, in_file):
    """Add morphological paradigms to the database from a TSV file."""
    data = pd.read_csv(in_file, sep='\t', index_col=False,
                       names=['wordform', 'corpus_freq', 'component_codes',
                              'human_readable_c_code', 'first_year',
                              'last_year', 'dict_ids', 'pos_tags', 'int_ids'])
    # drop the first row (contains an empty wordform)
    data = data.drop([0])

    # store the wordforms in the database
    wfs = data[['wordform']].copy()
    bulk_add_wordforms(session, wfs)

    # get the morphological variants from the pandas dataframe
    LOGGER.info('Extracting morphological variants.')
    morph_paradigms_per_wordform = defaultdict(list)
    with tqdm(total=data.shape[0]) as pbar:
        for row in data.iterrows():
            codes = row[1]['component_codes'].split('#')
            wordform = row[1]['wordform']
            for code in codes:
                morph_paradigms_per_wordform[wordform].append(
                    split_component_code(code, wordform))
            pbar.update()

    LOGGER.info('Looking up wordform ids.')
    select_statement = select([Wordform]).where(
        Wordform.wordform.in_(wfs['wordform']))
    mapping = session.execute(select_statement).fetchall()

    LOGGER.info('Writing morphological variants to file.')
    with get_temp_file() as mp_file:
        total_lines_written = write_json_lines(
            mp_file, morph_iterator(morph_paradigms_per_wordform, mapping))
        LOGGER.info('Wrote %s morphological variants.', total_lines_written)

        LOGGER.info('Inserting morphological variants into the database.')
        sql_insert_batches(session, MorphologicalParadigm,
                           read_json_lines(mp_file), batch_size=50000)

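# `sql_insert_batches` is assumed to buffer the JSON-lines records and insert
# each batch via SQLAlchemy Core. A minimal sketch of that assumption (the
# real helper may differ in signature and behaviour):
def _sql_insert_batches_sketch(session, orm_class, records, batch_size=50000):
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) == batch_size:
            # executemany-style insert of one batch of dicts
            session.execute(orm_class.__table__.insert(), batch)
            batch = []
    if batch:
        # insert the final, smaller batch
        session.execute(orm_class.__table__.insert(), batch)
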
def test_read_and_write_json_lines(fs):
    objects = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}, {'a': 5, 'b': 6}]
    fname = 'objects'

    f = open(fname, 'w')
    total = write_json_lines(f, objects)
    f.close()

    f = open(fname, 'r')
    assert os.path.exists(fname)
    assert total == len(objects)
    results = [o for o in read_json_lines(f)]
    f.close()

    assert objects == results

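# A minimal sketch of the JSON-lines helpers these tests exercise: one JSON
# document per line on write (returning the number of lines written), one
# parsed object per line on read. The real implementations live elsewhere in
# the codebase and may differ; `json` is assumed to be imported.
def _write_json_lines_sketch(file_handle, objects):
    total = 0
    for obj in objects:
        file_handle.write(json.dumps(obj))
        file_handle.write('\n')
        total += 1
    return total


def _read_json_lines_sketch(file_handle):
    for line in file_handle:
        yield json.loads(line)
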
def add_corpus_core(session, corpus_matrix, vectorizer, corpus_name,
                    document_metadata=pd.DataFrame(), batch_size=50000):
    """Add a corpus to the database.

    A corpus is a collection of documents, which are in turn collections of
    words. This function adds all words as wordforms to the database, records
    their "attestation" (the fact that they occur in a certain document and
    with what frequency), adds the documents they belong to, adds the corpus,
    and adds the corpus ID to the documents.

    Inputs:
        session: SQLAlchemy session (e.g. from `dbutils.get_session`)
        corpus_matrix: the dense corpus term-document matrix, like from
            `tokenize.terms_documents_matrix_ticcl_frequency`
        vectorizer: the terms in the term-document matrix, as given by
            `tokenize.terms_documents_matrix_ticcl_frequency`
        corpus_name: the name of the corpus in the database
        document_metadata: see `ticclat_schema.Document` for all the possible
            metadata. Make sure the index of this dataframe matches the
            document identifiers in the term-document matrix, which can
            easily be achieved by resetting the index of a pandas dataframe.
        batch_size: batch handling of wordforms to avoid memory issues.
    """
    with get_temp_file() as wf_file:
        write_json_lines(wf_file, iterate_wf(vectorizer.vocabulary_))

        # Prepare the documents to be added to the database
        LOGGER.info('Creating document data')
        corpus_csr = scipy.sparse.csr_matrix(corpus_matrix)
        word_counts = corpus_csr.sum(axis=1)  # sum the rows
        wc_list = np.array(word_counts).flatten().tolist()
        document_metadata['word_count'] = wc_list

        # Determine which wordforms in the vocabulary need to be added to
        # the database
        LOGGER.info('Determine which wordforms need to be added')
        with get_temp_file() as wf_to_add_file:
            with tqdm(total=count_lines(wf_file)) as pbar:
                for chunk in chunk_json_lines(wf_file, batch_size=batch_size):
                    # Find out which wordforms are not yet in the database
                    wordforms = {wf['wordform'] for wf in chunk}
                    select_statement = select([Wordform]).where(
                        Wordform.wordform.in_(wordforms))
                    result = session.execute(select_statement).fetchall()

                    # wf: (id, wordform, anahash_id, wordform_lowercase)
                    existing_wfs = {wf[1] for wf in result}
                    for wordform in wordforms.difference(existing_wfs):
                        wf_to_add_file.write(json.dumps({
                            'wordform': wordform,
                            'wordform_lowercase': wordform.lower()
                        }))
                        wf_to_add_file.write('\n')
                    pbar.update(batch_size)

            # Create the corpus (in a session) and get the ID
            LOGGER.info('Creating the corpus')
            corpus = Corpus(name=corpus_name)
            session.add(corpus)

            # Add the documents using the ORM, because we need to link them
            # to the corpus
            LOGGER.info('Adding the documents')
            for doc in document_metadata.to_dict(orient='records'):
                document_obj = Document(**doc)
                document_obj.document_corpora.append(corpus)
            session.flush()
            corpus_id = corpus.corpus_id

            # Insert the wordforms that need to be added using SQLAlchemy
            # Core (much faster than using the ORM)
            LOGGER.info('Adding the wordforms')
            bulk_add_wordforms_core(session, read_json_lines(wf_to_add_file))

            LOGGER.info('Prepare adding the text attestations')
            # make a mapping from wordform to wordform_id in the database
            df = pd.DataFrame.from_dict(vectorizer.vocabulary_,
                                        orient='index')
            df = df.reset_index()

            LOGGER.info('\tGetting the wordform ids')
            wf_mapping = {}
            for chunk in chunk_df(df, batch_size=batch_size):
                to_select = list(chunk['index'])
                select_statement = select([Wordform]).where(
                    Wordform.wordform.in_(to_select))
                result = session.execute(select_statement).fetchall()
                for wordform in result:
                    # wordform: (id, wordform, anahash_id,
                    # wordform_lowercase)
                    wf_mapping[wordform[1]] = wordform[0]
            LOGGER.info('\tGetting the document ids')
            # get doc_ids
            select_statement = select(
                [corpusId_x_documentId.join(Corpus).join(Document)]) \
                .where(Corpus.corpus_id == corpus_id) \
                .order_by(Document.document_id)
            result = session.execute(select_statement).fetchall()
            # row: (corpus_id, document_id, ...)
            doc_ids = [row[1] for row in result]

            LOGGER.info('\tReversing the mapping')
            # reverse mapping from wordform to id in the terms/document
            # matrix
            word_from_tdmatrix_id = dict(
                zip(vectorizer.vocabulary_.values(),
                    vectorizer.vocabulary_.keys()))

            LOGGER.info('\tGetting the text attestations')
            with get_temp_file() as ta_file:
                write_json_lines(ta_file,
                                 get_tas(corpus_matrix, doc_ids, wf_mapping,
                                         word_from_tdmatrix_id))

                LOGGER.info('Adding the text attestations')
                total = count_lines(ta_file)
                bulk_add_textattestations_core(session,
                                               read_json_lines(ta_file),
                                               total=total,
                                               batch_size=batch_size)

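# Hypothetical usage sketch for `add_corpus_core`. The tokenize helper and
# `get_session` are named in the docstring above; the metadata column and
# variable names are illustrative assumptions:
#
#     corpus_matrix, vectorizer = \
#         tokenize.terms_documents_matrix_ticcl_frequency(freq_files)
#     metadata = pd.DataFrame({'title': titles}).reset_index(drop=True)
#     session = get_session(user, password, db_name)()
#     add_corpus_core(session, corpus_matrix, vectorizer, 'my_corpus',
#                     document_metadata=metadata)
#     session.commit()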