def store_table(self, name, description, csv_path, csv_mapping):
    """
    Read CSV file line by line and store the data in HDF5 data store

    Any existing node called ``name`` is removed first (through
    ``self._remove_node``) and the table is created again, so the call
    replaces the table contents rather than appending to them.

    Each column of ``description`` is filled as follows:

    * if ``csv_mapping`` has a member with the column's name, the cell is
      the encoded CSV value found at that member's ``value`` index;
    * otherwise, if the column name ends with ``_index``, the cell is the
      zero-based number of the row being written;
    * otherwise the column is left untouched.

    :param name: name of the table
    :param description: subclass of tables.IsDescription, class describing
        the columns of the table (number, data types, etc.)
    :param csv_path: path to the input CSV file
    :param csv_mapping: Enum containing mapping of the HDF5 table
        description columns to the CSV columns (for each column in
        'description' this should contain index of the column in the
        input CSV)
    :return: how many rows were stored in the datastore
    :raises AssertionError: if ``description`` is not a subclass of
        tables.IsDescription, or if no column name is shared between
        ``description`` and ``csv_mapping``
    """
    # Validation is kept as assertions so that callers observe the same
    # exception type (AssertionError) as before.
    msg = ("The descriptor parameter has to be instance "
           "of tables.IsDescription class")
    assert issubclass(description, tables.IsDescription), msg
    msg = "No intersection between HDF5 description and CSV mapping"
    mapped_names = {member.name for member in csv_mapping}
    assert mapped_names.intersection(description.columns), msg

    # How each column is filled does not depend on the row, so resolve it
    # once here instead of calling hasattr/getattr for every cell.
    mapped_columns = []
    index_columns = []
    for column in description.columns:
        if hasattr(csv_mapping, column):
            mapped_columns.append(
                (column, getattr(csv_mapping, column).value))
        elif column.endswith('_index'):
            index_columns.append(column)

    row_index = 0
    self.logger.debug('Checking the number of rows to be stored')
    total = wsdmlog.get_total(csv_path)
    self.logger.debug('Total: %s', total)
    how_often = wsdmlog.how_often(total)

    with tables.open_file(self.datastore_path, 'a') as ds:
        # first remove old node
        self._remove_node(ds, name)
        # then create again
        table = ds.create_table(ds.root, name, description=description,
                                expectedrows=total)
        self.logger.debug('Created table %s', name)
        hdf_row = table.row

        # iterate over csv and write it in the table line by line
        csv_datastore = CsvDatastore()
        for csv_row in csv_datastore.read_csv(csv_path):
            for column, csv_col_index in mapped_columns:
                hdf_row[column] = str.encode(csv_row[csv_col_index])
            for column in index_columns:
                hdf_row[column] = row_index
            hdf_row.append()
            row_index += 1
            # Skip progress reporting if the interval is 0; a modulo by
            # zero would otherwise abort the import.
            if how_often and row_index % how_often == 0:
                self.logger.debug(wsdmlog.get_progress(row_index, total))

        # Flush once after the loop: rows are buffered by PyTables while
        # appending, so flushing after every single row only adds I/O.
        table.flush()

    return row_index
def store_table(self, name, description, csv_path, csv_mapping):
    """
    Read CSV file line by line and store the data in HDF5 data store

    Any existing node called ``name`` is removed first (through
    ``self._remove_node``) and the table is created again, so the call
    replaces the table contents rather than appending to them.

    Each column of ``description`` is filled as follows:

    * if ``csv_mapping`` has a member with the column's name, the cell is
      the encoded CSV value found at that member's ``value`` index;
    * otherwise, if the column name ends with ``_index``, the cell is the
      zero-based number of the row being written;
    * otherwise the column is left untouched.

    :param name: name of the table
    :param description: subclass of tables.IsDescription, class describing
        the columns of the table (number, data types, etc.)
    :param csv_path: path to the input CSV file
    :param csv_mapping: Enum containing mapping of the HDF5 table
        description columns to the CSV columns (for each column in
        'description' this should contain index of the column in the
        input CSV)
    :return: how many rows were stored in the datastore
    :raises AssertionError: if ``description`` is not a subclass of
        tables.IsDescription, or if no column name is shared between
        ``description`` and ``csv_mapping``
    """
    # Validation is kept as assertions so that callers observe the same
    # exception type (AssertionError) as before.
    msg = ("The descriptor parameter has to be instance "
           "of tables.IsDescription class")
    assert issubclass(description, tables.IsDescription), msg
    msg = "No intersection between HDF5 description and CSV mapping"
    mapped_names = {member.name for member in csv_mapping}
    assert mapped_names.intersection(description.columns), msg

    # How each column is filled does not depend on the row, so resolve it
    # once here instead of calling hasattr/getattr for every cell.
    mapped_columns = []
    index_columns = []
    for column in description.columns:
        if hasattr(csv_mapping, column):
            mapped_columns.append(
                (column, getattr(csv_mapping, column).value))
        elif column.endswith('_index'):
            index_columns.append(column)

    row_index = 0
    self.logger.debug('Checking the number of rows to be stored')
    total = wsdmlog.get_total(csv_path)
    self.logger.debug('Total: %s', total)
    how_often = wsdmlog.how_often(total)

    with tables.open_file(self.datastore_path, 'a') as ds:
        # first remove old node
        self._remove_node(ds, name)
        # then create again
        table = ds.create_table(ds.root, name, description=description,
                                expectedrows=total)
        self.logger.debug('Created table %s', name)
        hdf_row = table.row

        # iterate over csv and write it in the table line by line
        csv_datastore = CsvDatastore()
        for csv_row in csv_datastore.read_csv(csv_path):
            for column, csv_col_index in mapped_columns:
                hdf_row[column] = str.encode(csv_row[csv_col_index])
            for column in index_columns:
                hdf_row[column] = row_index
            hdf_row.append()
            row_index += 1
            # Skip progress reporting if the interval is 0; a modulo by
            # zero would otherwise abort the import.
            if how_often and row_index % how_often == 0:
                self.logger.debug(wsdmlog.get_progress(row_index, total))

        # Flush once after the loop: rows are buffered by PyTables while
        # appending, so flushing after every single row only adds I/O.
        table.flush()

    return row_index
def load_paper_journal_matrix(self, papers, journals):
    """
    Build the paper-journal relation matrix from the Papers.txt data file

    :param papers: dictionary of {id: index}
    :param journals: dictionary of {id: index}
    :return: whatever CsvDatastore.csv_to_relation_matrix produces for the
        paper_id / journal_id columns of the file
    """
    papers_file = Config.get_path_to_data_file('Papers.txt')
    datastore = CsvDatastore()
    paper_id_col = PapersCsv.paper_id.value
    journal_id_col = PapersCsv.journal_id.value
    return datastore.csv_to_relation_matrix(
        papers_file, paper_id_col, papers, journal_id_col, journals)
def load_paper_field_of_study_matrix(self, papers, fos):
    """
    Build the paper-field of study relation matrix from the
    PaperKeywords.txt data file

    :param papers: dictionary of {id: index}
    :param fos: fields of study, dictionary of {id: index}
    :return: whatever CsvDatastore.csv_to_relation_matrix produces for the
        paper_id / field_id columns of the file
    """
    keywords_file = Config.get_path_to_data_file('PaperKeywords.txt')
    datastore = CsvDatastore()
    paper_id_col = PaperKeywordsCsv.paper_id.value
    field_id_col = PaperKeywordsCsv.field_id.value
    return datastore.csv_to_relation_matrix(
        keywords_file, paper_id_col, papers, field_id_col, fos)
def load_paper_conf_series_matrix(self, papers, conf_series):
    """
    Build the paper-conference series relation matrix from the Papers.txt
    data file

    :param papers: dictionary of {id: index}
    :param conf_series: dictionary of {id: index}
    :return: whatever CsvDatastore.csv_to_relation_matrix produces for the
        paper_id / conference_series_id columns of the file
    """
    papers_file = Config.get_path_to_data_file('Papers.txt')
    datastore = CsvDatastore()
    paper_id_col = PapersCsv.paper_id.value
    conf_series_id_col = PapersCsv.conference_series_id.value
    return datastore.csv_to_relation_matrix(
        papers_file, paper_id_col, papers, conf_series_id_col, conf_series)
def load_author_sequence_matrix(self, papers, authors):
    """
    Build the paper-author relation matrix from the
    PaperAuthorAffiliations.txt data file

    Unlike the other relation loaders, the author_seq_number column index
    is passed to csv_to_relation_matrix as an extra argument (presumably
    the column supplying the matrix values -- confirm against
    CsvDatastore).

    :param papers: dictionary of {id: index}
    :param authors: dictionary of {id: index}
    :return: whatever CsvDatastore.csv_to_relation_matrix produces for the
        paper_id / author_id columns of the file
    """
    affiliations_file = Config.get_path_to_data_file(
        'PaperAuthorAffiliations.txt')
    datastore = CsvDatastore()
    paper_id_col = PapAuthAff.paper_id.value
    author_id_col = PapAuthAff.author_id.value
    seq_number_col = PapAuthAff.author_seq_number.value
    return datastore.csv_to_relation_matrix(
        affiliations_file, paper_id_col, papers, author_id_col, authors,
        seq_number_col)
def load_paper_affiliation_matrix(self, papers, affiliations):
    """
    Create the papers x affiliations matrix out of the paper-affiliation
    relations listed in the PaperAuthorAffiliations.txt file

    :param papers: dictionary of {id: index}
    :param affiliations: dictionary of {id: index}
    :return: scipy.sparse.csr_matrix
    """
    affiliations_file = Config.get_path_to_data_file(
        'PaperAuthorAffiliations.txt')
    datastore = CsvDatastore()
    paper_id_col = PapAuthAff.paper_id.value
    affiliation_id_col = PapAuthAff.affiliation_id.value
    return datastore.csv_to_relation_matrix(
        affiliations_file, paper_id_col, papers,
        affiliation_id_col, affiliations)
def load_citation_matrix(self, papers):
    """
    Create the citation adjacency matrix out of the edges listed in the
    PaperReferences.txt file

    The same ``papers`` mapping is used for both the citing paper
    (paper_id column) and the cited paper (reference_id column).

    :param papers: dictionary of {id: index}
    :return: scipy.sparse.csr_matrix
    """
    references_file = Config.get_path_to_data_file('PaperReferences.txt')
    datastore = CsvDatastore()
    paper_id_col = PapRef.paper_id.value
    reference_id_col = PapRef.reference_id.value
    return datastore.csv_to_relation_matrix(
        references_file, paper_id_col, papers, reference_id_col, papers)
def output_results(df, columns):
    """
    Write the results to the next results CSV file and copy that file to
    the results upload path

    :param df: pandas.DataFrame
    :param columns: columns with paper_id and results
    :return: None
    """
    log = logging.getLogger(__name__)
    target_csv = Config.get_next_results_file_path()
    upload_copy = Config.get_results_upload_path()

    log.info('Storing results in a CSV %s', target_csv)
    CsvDatastore().store_results(df, target_csv, columns)

    log.info('Copying results to the upload file %s', upload_copy)
    shutil.copyfile(target_csv, upload_copy)