Example #1
    def store_table(self, name, description, csv_path, csv_mapping):
        """
        Read a CSV file line by line and store the data in the HDF5 data store
        :param name: name of the table
        :param description: subclass of tables.IsDescription describing
                            the columns of the table (number, data types, etc.)
        :param csv_path: path to the input CSV file
        :param csv_mapping: Enum class mapping the HDF5 table description
                            columns to the CSV columns (for each column in
                            'description' it should contain the index of
                            the matching column in the input CSV)
        :return: how many rows were stored in the datastore
        """
        msg = ("The descriptor parameter has to be instance "
               "of tables.IsDescription class")
        assert (issubclass(description, tables.IsDescription)), msg

        msg = "No intersection between HDF5 description and CSV mapping"
        assert (len(
            set([i.name for i in csv_mapping]).intersection(
                description.columns)) > 0), msg

        row_index = 0
        self.logger.debug('Checking the number of rows to be stored')
        total = wsdmlog.get_total(csv_path)
        self.logger.debug('Total: %s', total)
        how_often = wsdmlog.how_often(total)

        with tables.open_file(self.datastore_path, 'a') as ds:
            # first remove old node
            self._remove_node(ds, name)
            # then create again
            table = ds.create_table(ds.root,
                                    name,
                                    description=description,
                                    expectedrows=total)
            self.logger.debug('Created table %s', name)
            hdf_row = table.row
            # iterate over the CSV and write it into the table line by line
            csv_datastore = CsvDatastore()
            for csv_row in csv_datastore.read_csv(csv_path):
                for column in description.columns:
                    if hasattr(csv_mapping, column):
                        csv_col_index = getattr(csv_mapping, column).value
                        hdf_row[column] = str.encode(csv_row[csv_col_index])
                    elif column.endswith('_index'):
                        hdf_row[column] = row_index
                hdf_row.append()
                row_index += 1
                if row_index % how_often == 0:
                    self.logger.debug(wsdmlog.get_progress(row_index, total))
            # flush once after the loop; flushing per row defeats
            # PyTables' write buffering
            table.flush()

        return row_index
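
A minimal usage sketch for store_table, assuming a hypothetical
DemoDescription/DemoCsv pair and a `datastore` instance of the class that
defines the method; the real column layouts live in the project:

import tables
from enum import Enum

class DemoDescription(tables.IsDescription):
    # hypothetical table layout; columns that also appear in the Enum
    # mapping are read from the CSV, '*_index' columns get the row index
    paper_id = tables.StringCol(16)
    title = tables.StringCol(256)
    paper_index = tables.Int64Col()

class DemoCsv(Enum):
    # hypothetical CSV layout: column positions in the input file
    paper_id = 0
    title = 1

rows_stored = datastore.store_table(
    'papers', DemoDescription, 'input.csv', DemoCsv)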
Example #2
    def load_paper_journal_matrix(self, papers, journals):
        """
        :param papers: dictionary of {id: index}
        :param journals: dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('Papers.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PapersCsv.paper_id.value, papers,
            PapersCsv.journal_id.value, journals)
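
Each of these loaders follows the same pattern: csv_to_relation_matrix gets
the CSV path, then a (CSV column index, {id: index} dictionary) pair for the
matrix rows and another pair for the columns. A minimal sketch of the
dictionaries a caller would build, assuming hypothetical ids and a `loader`
instance of the defining class:

paper_ids = ['p1', 'p2', 'p3']   # hypothetical paper ids
journal_ids = ['j1', 'j2']       # hypothetical journal ids
papers = {pid: i for i, pid in enumerate(paper_ids)}
journals = {jid: i for i, jid in enumerate(journal_ids)}

m = loader.load_paper_journal_matrix(papers, journals)
# m[papers['p1'], journals['j1']] is nonzero iff Papers.txt relates p1 to j1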
Example #3
    def load_paper_field_of_study_matrix(self, papers, fos):
        """
        :param papers: dictionary of {id: index}
        :param fos: fields of study, dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('PaperKeywords.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PaperKeywordsCsv.paper_id.value, papers,
            PaperKeywordsCsv.field_id.value, fos)
Example #4
    def load_paper_conf_series_matrix(self, papers, conf_series):
        """
        :param papers: dictionary of {id: index}
        :param conf_series: dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('Papers.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PapersCsv.paper_id.value, papers,
            PapersCsv.conference_series_id.value, conf_series)
Example #5
    def load_author_sequence_matrix(self, papers, authors):
        """
        :param papers: dictionary of {id: index}
        :param authors: dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('PaperAuthorAffiliations.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PapAuthAff.paper_id.value, papers,
            PapAuthAff.author_id.value, authors,
            PapAuthAff.author_seq_number.value)
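
Note the extra argument here: unlike the other loaders, this one also passes
PapAuthAff.author_seq_number.value. Presumably csv_to_relation_matrix uses
that third column as the stored cell values, so the resulting matrix records
each author's position in the byline rather than a plain 0/1 incidence.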
Example #6
    def load_paper_affiliation_matrix(self, papers, affiliations):
        """
        Build a matrix of papers and affiliations from the list of
        paper-affiliation relations in the PaperAuthorAffiliations.txt file
        :param papers: dictionary of {id: index}
        :param affiliations: dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('PaperAuthorAffiliations.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PapAuthAff.paper_id.value, papers,
            PapAuthAff.affiliation_id.value, affiliations)
Example #7
    def load_citation_matrix(self, papers):
        """
        Build an adjacency matrix from the list of edges in the
        PaperReferences.txt file
        :param papers: dictionary of {id: index}
        :return: scipy.sparse.csr_matrix
        """
        fpath = Config.get_path_to_data_file('PaperReferences.txt')
        return CsvDatastore().csv_to_relation_matrix(
            fpath, PapRef.paper_id.value, papers,
            PapRef.reference_id.value, papers)
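
Because the citation loader passes the same `papers` dictionary for both
axes, rows and columns share one index space and the result is a square
adjacency matrix. A minimal sketch of reading it, assuming a `loader`
instance and the same `papers` dict used to build the matrix:

cites = loader.load_citation_matrix(papers)   # scipy.sparse.csr_matrix
out_degree = cites.sum(axis=1)   # references each paper makes
in_degree = cites.sum(axis=0)    # citations each paper receives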
Example #8
def output_results(df, columns):
    """
    :param df: pandas.DataFrame
    :param columns: list of column names to write (the paper_id column
                    plus the result columns)
    :return: None
    """
    logger = logging.getLogger(__name__)
    results_path = Config.get_next_results_file_path()
    upload_path = Config.get_results_upload_path()
    logger.info('Storing results in a CSV %s', results_path)
    CsvDatastore().store_results(df, results_path, columns)
    logger.info('Copying results to the upload file %s', upload_path)
    shutil.copyfile(results_path, upload_path)
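
A minimal usage sketch, assuming a hypothetical results frame; the real
column names come from whatever pipeline produced the predictions:

import pandas as pd

df = pd.DataFrame({
    'paper_id': ['p1', 'p2'],
    'score': [0.91, 0.47],
})
output_results(df, ['paper_id', 'score'])
# writes the CSV to Config.get_next_results_file_path() and copies it
# to Config.get_results_upload_path()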