コード例 #1
0
    def load_publicationauthor(self, preprocess=True, columns=None, isindict=None,
                               duplicate_subset=None, duplicate_keep='last', dropna=None,
                               show_progress=False):
        """
        Load the PublicationAuthor DataFrame from the preprocessed directory.

        For DBLP there is no raw-file fallback: the ``publicationauthor`` directory
        must already exist under ``self.path2database`` (i.e. preprocess must have
        been run), otherwise this method raises.

        Parameters
        ----------
        preprocess : bool, default True, Optional
            Attempt to load from the preprocessed directory.  When False the method
            raises without looking at the file system.

        columns : list, default None, Optional
            Load only this subset of columns.

        isindict : dict, default None, Optional
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that
            appear in "ListofValues" will be returned.

        duplicate_subset : list, default None, Optional
            Drop any duplicate entries as specified by this subset of columns.

        duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns.

        show_progress : bool, default False, Optional
            When truthy, the label 'Loading PublicationAuthor' is handed to the loader
            in place of the flag; a falsy value is forwarded unchanged.

        Returns
        -------
        DataFrame
            PublicationAuthor DataFrame.

        Raises
        ------
        NotImplementedError
            If `preprocess` is falsy or the preprocessed directory does not exist.

        """
        table_name = 'publicationauthor'

        # Guard clause: without a preprocessed directory there is nothing to load.
        if not preprocess or not os.path.exists(os.path.join(self.path2database, table_name)):
            raise NotImplementedError(
                "DBLP is stored as a single xml file.  Run preprocess to parse the file.")

        # A truthy flag is swapped for a descriptive label; falsy values pass through as-is.
        progress_label = 'Loading PublicationAuthor' if show_progress else show_progress

        loader_options = {
            'path2database': self.path2database,
            'columns': columns,
            'isindict': isindict,
            'duplicate_subset': duplicate_subset,
            'duplicate_keep': duplicate_keep,
            'dropna': dropna,
            'show_progress': progress_label,
        }
        return load_preprocessed_data(table_name, **loader_options)
コード例 #2
0
ファイル: database.py プロジェクト: shouwangbuqi/pyscisci
    def load_impact(self, preprocess = True, include_yearnormed = True, columns = None, isindict = None, duplicate_subset = None,
        duplicate_keep = 'last', dropna = None, prefunc2apply=None, postfunc2apply=None, show_progress=False):
        """
        Load the precomputed impact DataFrame from a preprocessed directory, or compute it.

        If the ``impact`` directory exists under ``self.path2database`` (and `preprocess`
        is truthy) the data is read through ``load_preprocessed_data``.  Otherwise the
        result of ``self.compute_impact()`` is returned; none of the filtering arguments
        below are applied on that fallback path.

        Parameters
        ----------
        preprocess : bool, default True
            Attempt to load from the preprocessed directory.

        include_yearnormed : bool, default True
            For every column other than 'PublicationId' and 'Year', add a ``<column>_norm``
            column holding the value divided by that column's mean over the DataFrame handed
            to the pre-processing hook.
            NOTE(review): presumably the loader invokes the hook once per yearly file, which
            is what makes this a yearly normalization -- confirm against load_preprocessed_data.

        columns : list, default None
            Load only this subset of columns

        isindict : dict, default None, Optional
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that appear in
            "ListofValues" will be returned.

        duplicate_subset : list, default None, Optional
            Drop any duplicate entries as specified by this subset of columns

        duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns

        prefunc2apply : callable, default None, Optional
            Function taking and returning a DataFrame.  It is run inside the pre-processing
            hook, after the optional normalization, so it can see the ``_norm`` columns.

        postfunc2apply : callable, default None, Optional
            Forwarded unchanged to ``load_preprocessed_data``.

        show_progress : bool, default False, Optional
            When truthy, the label 'Loading Impact' is handed to the loader in place of the flag.

        Returns
        -------
        DataFrame
            Impact DataFrame when loaded from the preprocessed directory; otherwise whatever
            ``self.compute_impact()`` returns.

        """
        if show_progress:
            show_progress='Loading Impact'

        def prefunc(impactdf):
            # Normalize first so a caller-supplied hook cannot change the means being used.
            if include_yearnormed:
                # Snapshot the column list before adding the new '_norm' columns.
                impactcolumns = [c for c in list(impactdf) if c not in ('PublicationId', 'Year')]
                for c in impactcolumns:
                    impactdf[c+'_norm'] = impactdf[c]/impactdf[c].mean()
            # Previously this argument was accepted but silently ignored.
            if prefunc2apply is not None:
                impactdf = prefunc2apply(impactdf)
            return impactdf

        if preprocess and os.path.exists(os.path.join(self.path2database, 'impact')):
            return load_preprocessed_data('impact', path2database=self.path2database, columns=columns,
                isindict=isindict, duplicate_subset=duplicate_subset, duplicate_keep=duplicate_keep, dropna=dropna,
                prefunc2apply=prefunc, postfunc2apply=postfunc2apply, show_progress=show_progress)
        else:
            # Return (not raise) the computed result: raising a non-exception object is a TypeError.
            return self.compute_impact()
コード例 #3
0
ファイル: database.py プロジェクト: shouwangbuqi/pyscisci
    def load_references(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None,
                        duplicate_keep='last', noselfcite=False, dropna=None, prefunc2apply=None,
                        postfunc2apply=None, show_progress=False):
        """
        Load the Pub2Ref DataFrame from a preprocessed directory, or parse from the raw files.

        The preprocessed directory is ``pub2refnoself`` when `noselfcite` is truthy and
        ``pub2ref`` otherwise.  If that directory is missing, or `preprocess` is falsy,
        ``self.parse_references()`` is called with no arguments and its result returned;
        the filtering arguments and `noselfcite` are not forwarded on that path.

        Parameters
        ----------
        preprocess : bool, default True, Optional
            Attempt to load from the preprocessed directory.

        columns : list, default None, Optional
            Load only this subset of columns.

        isindict : dict, default None, Optional
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that
            appear in "ListofValues" will be returned.

        duplicate_subset : list, default None, Optional
            Drop any duplicate entries as specified by this subset of columns.

        duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        noselfcite : bool, default False, Optional
            If True, then the preprocessed pub2ref files with self-citations removed will be used.

        dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns.

        prefunc2apply, postfunc2apply : default None, Optional
            Forwarded unchanged to ``load_preprocessed_data``.

        show_progress : bool, default False, Optional
            When truthy, the label 'Loading <directory name>' is handed to the loader
            in place of the flag.

        Returns
        -------
        DataFrame
            Pub2Ref DataFrame.

        """
        fileprefix = 'pub2refnoself' if noselfcite else 'pub2ref'
        progress_label = 'Loading {}'.format(fileprefix) if show_progress else show_progress

        # Fallback first: nothing preprocessed to read, so parse the raw files.
        if not preprocess or not os.path.exists(os.path.join(self.path2database, fileprefix)):
            return self.parse_references()

        return load_preprocessed_data(
            fileprefix,
            path2database=self.path2database,
            columns=columns,
            isindict=isindict,
            duplicate_subset=duplicate_subset,
            duplicate_keep=duplicate_keep,
            dropna=dropna,
            prefunc2apply=prefunc2apply,
            postfunc2apply=postfunc2apply,
            show_progress=progress_label,
        )
コード例 #4
0
ファイル: database.py プロジェクト: shouwangbuqi/pyscisci
    def load_authors(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None,
                     duplicate_keep='last', dropna=None, prefunc2apply=None, postfunc2apply=None,
                     process_name=True, show_progress=True):
        """
        Load the Author DataFrame from a preprocessed directory, or parse from the raw files.

        If the ``author`` directory is missing under ``self.path2database``, or `preprocess`
        is falsy, ``self.parse_authors(process_name=process_name)`` is returned instead;
        the filtering arguments are not forwarded on that path.

        Parameters
        ----------
        preprocess : bool, default True, Optional
            Attempt to load from the preprocessed directory.

        columns : list, default None, Optional
            Load only this subset of columns.

        isindict : dict, default None, Optional
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that
            appear in "ListofValues" will be returned.

        duplicate_subset : list, default None, Optional
            Drop any duplicate entries as specified by this subset of columns.

        duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns.

        prefunc2apply, postfunc2apply : default None, Optional
            Forwarded unchanged to ``load_preprocessed_data``.

        process_name : bool, default True, Optional
            If True, then when processing the raw file, the package `NameParser <https://nameparser.readthedocs.io/en/latest/>`_
            will be used to split author FullNames.  Only used on the raw-file path.

        show_progress : bool, default True, Optional
            When truthy, the label 'Loading Authors' is handed to the loader in place of the flag.

        Returns
        -------
        DataFrame
            Author DataFrame.

        """
        have_preprocessed = preprocess and os.path.exists(os.path.join(self.path2database, 'author'))
        if not have_preprocessed:
            # Nothing preprocessed to read: build the table from the raw files.
            return self.parse_authors(process_name=process_name)

        progress_label = 'Loading Authors' if show_progress else show_progress
        filter_options = dict(columns=columns, isindict=isindict, duplicate_subset=duplicate_subset,
                              duplicate_keep=duplicate_keep, dropna=dropna)
        return load_preprocessed_data('author', path2database=self.path2database,
                                      prefunc2apply=prefunc2apply, postfunc2apply=postfunc2apply,
                                      show_progress=progress_label, **filter_options)
コード例 #5
0
ファイル: database.py プロジェクト: shouwangbuqi/pyscisci
    def load_pub2field(self, preprocess=True, columns=None, isindict=None, duplicate_subset=None,
                       duplicate_keep='last', dropna=None, prefunc2apply=None, postfunc2apply=None,
                       show_progress=False):
        """
        Load the Pub2Field DataFrame from a preprocessed directory, or parse from the raw files.

        If the ``pub2field`` directory is missing under ``self.path2database``, or `preprocess`
        is falsy, ``self.parse_fields()`` is called with no arguments and its result returned;
        the filtering arguments are not forwarded on that path.

        Parameters
        ----------
        preprocess : bool, default True, Optional
            Attempt to load from the preprocessed directory.

        columns : list, default None, Optional
            Load only this subset of columns.

        isindict : dict, default None, Optional
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that
            appear in "ListofValues" will be returned.

        duplicate_subset : list, default None, Optional
            Drop any duplicate entries as specified by this subset of columns.

        duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns.

        prefunc2apply, postfunc2apply : default None, Optional
            Forwarded unchanged to ``load_preprocessed_data``.

        show_progress : bool, default False, Optional
            When truthy, the label 'Loading Fields' is handed to the loader in place of the flag.

        Returns
        -------
        DataFrame
            Pub2Field DataFrame.

        """
        dirname = 'pub2field'

        if preprocess and os.path.exists(os.path.join(self.path2database, dirname)):
            # A truthy flag is swapped for a descriptive label; falsy values pass through as-is.
            progress_label = 'Loading Fields' if show_progress else show_progress
            field_frame = load_preprocessed_data(
                dirname,
                path2database=self.path2database,
                columns=columns,
                isindict=isindict,
                duplicate_subset=duplicate_subset,
                duplicate_keep=duplicate_keep,
                dropna=dropna,
                prefunc2apply=prefunc2apply,
                postfunc2apply=postfunc2apply,
                show_progress=progress_label,
            )
            return field_frame

        # Nothing preprocessed to read: build the table from the raw files.
        return self.parse_fields()