Example #1
def groupby_range(df, colgroupby, colrange, show_progress=False):
    """
    Group the DataFrame and find the range between the smallest and largest value for each group.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colrange: str
        The column to find the range of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the range.  If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colrange+`Range`
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colrange), '0'],
                            [str(colrange) + 'Range'] * 2)
    return df.groupby(colgroupby, sort=False)[colrange].progress_apply(
        lambda x: x.max() - x.min()).to_frame().reset_index().rename(
            columns=newname_dict)
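The helper zip2dict is not shown in these snippets. A minimal self-contained sketch of the same pattern, assuming zip2dict simply pairs two lists into a dict, could look like this:

import pandas as pd
from tqdm import tqdm

def zip2dict(keys, values):
    # assumed helper: pair two equal-length lists into a dict
    return dict(zip(keys, values))

toy = pd.DataFrame({'AuthorId': [1, 1, 2, 2, 2],
                    'Year': [2001, 2005, 2010, 2012, 2019]})

tqdm.pandas(desc='Range')  # registers .progress_apply on pandas objects
ranges = toy.groupby('AuthorId', sort=False)['Year'].progress_apply(
    lambda x: x.max() - x.min()).to_frame().reset_index().rename(
        columns=zip2dict(['Year', '0'], ['YearRange'] * 2))
# ranges has two columns: AuthorId and YearRange (4 for author 1, 9 for author 2)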
Example #2
def compute_hindex(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the h index for each group in the DataFrame.  See :cite:`hirsch2005index` for the definition.

    The algorithmic implementation for each author can be found in :py:func:`citationanalysis.author_hindex`.

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Author.

    :param colgroupby : str
        The DataFrame column with Author Ids.

    :param colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: colgroupby, colgroupby+'Hindex'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Hindex', disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'],
                            [str(colgroupby) + 'Hindex'] * 2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        author_hindex).to_frame().reset_index().rename(columns=newname_dict)
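author_hindex is imported from elsewhere in the package. A plausible stand-alone sketch of the h-index for a series of citation counts, assuming the standard Hirsch (2005) definition, would be:

import numpy as np

def author_hindex(citations):
    # h-index: the largest h such that h publications have at least h citations each
    c = np.sort(np.asarray(citations))[::-1]
    return int((c >= np.arange(1, c.shape[0] + 1)).sum())

author_hindex([10, 8, 5, 4, 3])   # -> 4
author_hindex([25, 8, 5, 3, 3])   # -> 3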
Example #3
def groupby_mean(df, colgroupby, colcountby, show_progress=False):
    """
    Group the DataFrame and find the mean of the column.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colcountby: str
        The column to find the mean of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the mean calculation.  If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colcountby+'Mean'
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'],
                            [str(colcountby) + 'Mean'] * 2)
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        lambda x: x.mean()).to_frame().reset_index().rename(
            columns=newname_dict)
Example #4
def compute_sleepingbeauty(df, colgroupby, colcountby, show_progress=False):
    """
    Calculate the sleeping beauty and awakening time for each group in the DataFrame.  See :cite:`ke2015beauty` for details.

    The algorithmic implementation for each publication can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each Author.

    colgroupby : str
        The DataFrame column with Author Ids.

    colcountby : str
        The DataFrame column with Citation counts for each publication.

    Returns
    -------
    DataFrame
        DataFrame with 3 columns: colgroupby, colgroupby+'Beauty', 'Awakening'

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Beauty', disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return df.groupby(colgroupby, sort=False)[colcountby].progress_apply(
        beauty_coefficient).to_frame().reset_index().rename(
            columns=newname_dict)
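beauty_coefficient is defined elsewhere in the package. As a rough, stand-alone sketch of the quantity being computed, the definition from Ke et al. (2015) applied to a yearly citation series is:

import numpy as np

def sleeping_beauty_sketch(yearly_citations):
    # c[t] = citations received in year t after publication, t = 0..T
    c = np.asarray(yearly_citations, dtype=float)
    tm = int(np.argmax(c))                      # year of the citation peak
    if tm == 0:
        return 0.0, 0                           # peaked immediately: no dormancy
    t = np.arange(tm + 1)
    # reference line from (0, c[0]) to (tm, c[tm])
    line = (c[tm] - c[0]) / tm * t + c[0]
    beauty = np.sum((line - c[:tm + 1]) / np.maximum(1.0, c[:tm + 1]))
    # awakening time: the year maximizing the distance to that line
    dist = np.abs((c[tm] - c[0]) * t - tm * c[:tm + 1] + tm * c[0])
    awakening = int(np.argmax(dist))
    return beauty, awakening

sleeping_beauty_sketch([0, 0, 1, 0, 2, 1, 15, 40])  # long dormancy, late peak -> large coefficient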
Example #5
def groupby_zero_col(df, colgroupby, colrange, show_progress=False):
    """
    Group the DataFrame and shift the column so the minimum value is 0.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colrange: str
        The column to find the range of values.

    :param show_progress: bool or str, default False
        If True, display a progress bar.  If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colrange
    """
    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    return df.groupby(
        colgroupby,
        sort=False)[colrange].progress_transform(lambda x: x - x.min())
Example #6
    def __init__(self, embedding_cache, max_length=512,
                 padding_side="right", pad_value=0, trunc_side="random"):
        assert Path(embedding_cache).exists(), "embedding cache file doesn't exist, need to run preprocessing.py"
        with open(embedding_cache, "rb") as pickle_in:
            print("loading the cache embedding data from pickle...")
            cache_df = pickle.load(pickle_in)

        # Adjust this part if you want to try another embedding
        self.review_df = cache_df.loc[cache_df["contact_embed"] != '[]']

        partial_pad_trunc = partial(pad_trunc_sequences, max_length=max_length,
                                    padding_side=padding_side, pad_value=pad_value, trunc_side=trunc_side)

        tqdm.pandas(desc="Padding and truncating hmd and head customer embedding...")
        review_input_df = self.review_df["contact_embed"].progress_apply(lambda x: partial_pad_trunc(x))
        self.review_input_ids = np.array([item[0] for item in review_input_df.values], dtype=np.int64)
        self.review_attention_mask = np.array([item[1] for item in review_input_df.values], dtype=np.bool_)
        self.review_token_type_ids = np.array([item[2] for item in review_input_df.values], dtype=np.int64)

        tqdm.pandas(desc="Padding and truncating agent embedding...")
        # change this part if want to use other embed
#         self.review_df['asic_sic_embed'] = self.review_df['asic_embed'] + self.review_df['sic_embed']
        agent_input_df = self.review_df["agent_embed"].progress_apply(lambda x: partial_pad_trunc(x))
        self.agent_input_ids = np.array([item[0] for item in agent_input_df.values], dtype=np.int64)
        self.agent_attention_mask = np.array([item[1] for item in agent_input_df.values], dtype=np.bool_)
        self.agent_token_type_ids = np.array([item[2] for item in agent_input_df.values], dtype=np.int64)
        

        if "anecdote_lead_final" in self.review_df.columns:
            # will convert the tag list to multi-label classification format
            # use this label set and order to encode the labels; a warning is expected because the empty tag "" is not encoded
            self.binarized_label = self.review_df["anecdote_lead_final"].astype(float).values

        print("finished!")
Example #7
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in the authors career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.  If None then the database 'AuthorId' is used.

    colcountby : str, default 'FieldId'
        The DataFrame column with Citation counts for each publication.  If None then the database 'FieldId' is used.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'AuthorId', 'TopFieldId'

    """

    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication
        pub2nfields['PublicationWeight'] = 1.0/pub2nfields[str(colcountby)+'Count']
        del pub2nfields[str(colcountby)+'Count']

        # merge counts
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # custom weighted mode based on the publication weights
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    newname_dict = zip2dict([str(colcountby), '0'], ['Top' + str(colcountby)]*2)
    return author2field.to_frame().reset_index().rename(columns=newname_dict)
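For intuition on the fractional_field_counts option, the helper-free toy example below shows how spreading each publication's weight over its fields can change an author's top field (it mirrors the weighted-mode logic above without the check4columns/groupby_count/progress_apply machinery):

import pandas as pd

pub2author = pd.DataFrame({
    'AuthorId':      [1, 1, 1, 1, 1, 1, 1],
    'PublicationId': ['p1', 'p2', 'p2', 'p2', 'p3', 'p3', 'p3'],
    'FieldId':       ['bio', 'cs', 'ml', 'stats', 'cs', 'ml', 'stats'],
})

# plain mode: 'cs', 'ml' and 'stats' each appear twice, 'bio' only once -> 'cs' wins the tie
print(pub2author.groupby('AuthorId')['FieldId'].apply(lambda x: x.mode()[0]))

# fractional counting: p2 and p3 each spread a weight of 1 over their 3 fields,
# so 'bio' (weight 1.0) beats 'cs'/'ml'/'stats' (weight 2/3 each)
nfields = pub2author.groupby('PublicationId')['FieldId'].transform('nunique')
weighted = pub2author.assign(PublicationWeight=1.0 / nfields)
print(weighted.groupby('AuthorId')
              .apply(lambda adf: adf.groupby('FieldId')['PublicationWeight'].sum().idxmax()))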
Example #8
def compute_disruption_index(pub2ref, show_progress=False):
    """
    Funk, Owen-Smith (2017) A Dynamic Network Measure of Technological Change *Management Science* **63**(3),791-817
    Wu, Wang, Evans (2019) Large teams develop and small teams disrupt science and technology *Nature* **566**, 378–382

    """
    if show_progress:
        print("Starting computation of disruption index.")

    reference_groups = pub2ref.groupby('CitingPublicationId',
                                       sort=False)['CitedPublicationId']
    citation_groups = pub2ref.groupby('CitedPublicationId',
                                      sort=False)['CitingPublicationId']

    def get_citation_groups(pid):
        try:
            return citation_groups.get_group(pid).values
        except KeyError:
            return []

    def disruption_index(citing_focus):
        focusid = citing_focus.name

        # if the focus publication has no references, then it has a disruption of None
        try:
            focusref = reference_groups.get_group(focusid)
        except KeyError:
            return None

        # implementation 1: keep it numpy
        #cite2ref = reduce(np.union1d, [get_citation_groups(refid) for refid in focusref])
        #nj = np.intersect1d(cite2ref, citing_focus.values).shape[0]
        #nk = cite2ref.shape[0] - nj

        # implementation 2: but dicts are faster...
        cite2ref = {
            citeid: 1
            for refid in focusref for citeid in get_citation_groups(refid)
        }
        nj = sum(cite2ref.get(pid, 0) for pid in citing_focus.values)
        nk = len(cite2ref) - nj

        ni = citing_focus.shape[0] - nj

        return (ni - nj) / (ni + nj + nk)

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Disruption Index', disable=not show_progress)

    newname_dict = {
        'CitingPublicationId': 'DisruptionIndex',
        'CitedPublicationId': 'PublicationId'
    }
    return citation_groups.progress_apply(
        disruption_index).to_frame().reset_index().rename(columns=newname_dict)
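Only pandas and tqdm are needed for the function above, so a usage sketch on a toy citation table is easy to check by hand. Here F references R; A1 and A2 cite only F, B cites both F and R, and C cites only R. With the implementation above (whose cite2ref set also picks up F's own reference edge), the index for F comes out to (2 - 1) / (2 + 1 + 2) = 0.2, while R gets None because it has no references in the table:

import pandas as pd
from tqdm import tqdm

pub2ref = pd.DataFrame({
    'CitingPublicationId': ['F', 'A1', 'A2', 'B', 'B', 'C'],
    'CitedPublicationId':  ['R', 'F',  'F',  'F', 'R', 'R'],
})

compute_disruption_index(pub2ref)
# -> one row per cited publication: PublicationId 'F' with DisruptionIndex 0.2, 'R' with None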
Example #9
def cogroupby(df, N):
    adj_mat = spsparse.dok_matrix((N, N), dtype=int)

    def inducedcombos(authorlist):
        if authorlist.shape[0] >= 2:
            for i, j in combinations(authorlist, 2):
                adj_mat[i, j] += 1

    tqdm.pandas(desc='CoAuthorship')
    df.groupby('PublicationId')['AuthorId'].progress_apply(inducedcombos)

    adj_mat = adj_mat + adj_mat.T

    return adj_mat
Example #10
def groupby_count(df,
                  colgroupby,
                  colcountby,
                  count_unique=True,
                  show_progress=False):
    """
    Group the DataFrame and count the number for each group.

    Parameters
    ----------
    :param df: DataFrame
        The DataFrame.

    :param colgroupby: str
        The column to groupby.

    :param colcountby: str
        The column to count.

    :param count_unique: bool, default True
        If True, count unique items in the rows.  If False, just return the number of rows.

    :param show_progress: bool or str, default False
        If True, display a progress bar for the count.  If str, the name of the progress bar to display.

    Returns
    ----------
    DataFrame
        DataFrame with two columns: colgroupby, colcountby+`Count`
    """

    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc=desc, disable=not show_progress)

    newname_dict = zip2dict([str(colcountby), '0'],
                            [str(colcountby) + 'Count'] * 2)
    if count_unique:
        count_df = df.groupby(
            colgroupby,
            sort=False)[colcountby].progress_apply(lambda x: x.nunique())
    else:
        count_df = df.groupby(
            colgroupby,
            sort=False)[colcountby].progress_apply(lambda x: x.shape[0])

    return count_df.to_frame().reset_index().rename(columns=newname_dict)
Example #11
def qfactor(show_progress=False):
    """
    This function calculates the Q-factor for an author.  See [q] for details.

    References
    ----------
    .. [q] Sinatra (2016): "title", *Science*.
           DOI: xxx
    """

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Q-factor', disable=not show_progress)

    # TODO: implement
    return False
Example #12
def compute_citation_rank(df,
                          colgroupby='Year',
                          colrankby='C10',
                          ascending=True,
                          normed=False,
                          show_progress=False):
    """
    Rank elements in the array from 0 (smallest) to N-1 (largest).

    Parameters
    ----------
    :param df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param colgroupby : str, list
        The DataFrame column(s) to subset by.

    :param colrankby : str
        The DataFrame column to rank by.

    :param ascending : bool, default True
        Sort ascending vs. descending.

    :param normed : bool, default False
        False : rank is from 0 to N -1
        True : rank is from 0 to 1

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        The original dataframe with a new column for rank: colrankby+"Rank"

    """
    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Citation Rank', disable=not show_progress)

    df[str(colrankby) +
       "Rank"] = df.groupby(colgroupby)[colrankby].progress_transform(
           lambda x: rank_array(x, ascending, normed))
    return df
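rank_array is a package utility that is not shown. A self-contained version consistent with the docstring (ranks 0 to N-1, optionally flipped, optionally normalized to [0, 1]) could be:

import numpy as np

def rank_array(values, ascending=True, normed=False):
    # rank 0 (smallest) .. N-1 (largest); ties are broken by position, not averaged
    order = np.argsort(np.asarray(values))
    ranks = np.empty_like(order)
    ranks[order] = np.arange(order.shape[0])
    if not ascending:
        ranks = order.shape[0] - 1 - ranks
    if normed:
        return ranks / max(order.shape[0] - 1, 1)
    return ranks

rank_array([10, 30, 20])               # -> array([0, 2, 1])
rank_array([10, 30, 20], normed=True)  # -> array([0. , 1. , 0.5])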
Example #13
def load_dataset(lang_path,
                 tokenizer,
                 max_length,
                 balanced=False,
                 dataset_name="test",
                 limit=None):
    logging.getLogger("transformers.tokenization_utils_base").setLevel(
        logging.ERROR)
    tqdm.pandas(leave=False)
    # Read data
    df = pd.read_csv(lang_path + "/{}.csv".format(dataset_name.split("_")[0]),
                     header=None)
    df.columns = ["sentiment", "review"]
    df["sentiment"] = pd.to_numeric(
        df["sentiment"])  # Sometimes label gets read as string

    # Remove excessively long examples
    lengths = df["review"].progress_apply(lambda x: len(tokenizer.encode(x)))
    df = df[lengths <= max_length].reset_index(
        drop=True)  # Remove long examples

    # Balance classes
    if dataset_name == "train" and balanced:
        positive_examples = df["sentiment"].sum()
        if not limit:
            # Find which class is the minority and set its size as limit
            n = min(positive_examples, df.shape[0] - positive_examples)
        else:
            n = limit
        ones_idx = np.random.choice(np.where(df["sentiment"])[0], size=n)
        zeros_idx = np.random.choice(np.where(df["sentiment"] == 0)[0], size=n)
        df = df.loc[list(ones_idx) + list(zeros_idx)].reset_index(drop=True)
    elif not balanced and limit:
        raise Exception(
            "Must set 'balanced' to True to choose a manual limit.")

    # Convert to TF dataset
    dataset = bert_convert_examples_to_tf_dataset(
        [(Example(text=text, category_index=label))
         for label, text in df.values],
        tokenizer,
        max_length=max_length)
    return df, dataset
Example #14
def publication_beauty(pub2ref_df,
                       colgroupby='CitedPublicationId',
                       colcountby='CitingPublicationId',
                       show_progress=False):
    """
    Calculate the sleeping beauty and awakening time for each cited publication.  See :cite:`ke2015beauty` for details.

    The algorithmic implementation can be found in :py:func:`sleepingbeauty.beauty_coefficient`.

    Parameters
    ----------
    pub2ref_df : DataFrame, default None, Optional
        A DataFrame with the temporal citing information.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with Author Ids.  If None then the database 'CitedPublicationId' is used.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with Citation counts for each publication.  If None then the database 'CitingPublicationId' is used.

    Returns
    -------
    DataFrame
        Trajectory DataFrame with 2 columns: colgroupby+'Beauty', 'Awakening'

    """

    check4columns(pub2ref_df,
                  ['CitedPublicationId', 'CitingPublicationId', 'CitingYear'])

    tqdm.pandas(desc='Beauty', disable=not show_progress)

    df = groupby_count(pub2ref_df,
                       colgroupby=['CitedPublicationId', 'CitingYear'],
                       colcountby='CitingPublicationId',
                       count_unique=True)

    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return df.groupby(colgroupby)[colcountby + 'Count'].progress_transform(
        beauty_coefficient).rename(columns=newname_dict)
Example #15
#TODO: read documentation
!pip install --quiet tqdm==4.59.0

from tqdm.notebook import tqdm
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from matplotlib import rcParams

rcParams['figure.figsize'] = 13, 7

tqdm.pandas()

pl.seed_everything(7)

df = pd.read_csv('/content/drive/MyDrive/allwind.csv')
df = df.drop(['ISTANBUL WindSpeed(m/s)'], axis = 1)
df = df.drop(['Unnamed: 0'] , axis = 1 )

df['Date'] = pd.date_range(start = '20180201' , freq = 'H' , periods = len(df))
df.dropna(inplace = True)
df.head()

# preprocessing:
rowsData = []
for i,row in tqdm(df.iterrows() , total = len(df)):
Example #16
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the coauthorship-network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the co-authorships between authors in the set.
            `publications` : the publication history of `focus_author_ids' defines the edge set, giving the co-authorships where at least
                                one author from `focus_author_ids' was involved.
            'ego' : the `focus_author_ids' defines a seed set, such that all authors must have co-authored at least one publication with 
                                an author from `focus_author_ids', but co-authorships are also found between the second-order author sets. 

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-authorship network

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    required_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if not focus_author_ids is None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                            focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values,
                focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

    #  map authors to the row/column of the adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    Nauthors = paa_df['AuthorId'].nunique()

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        if author_list.shape[0] >= 2:
            for ia, ja in combinations(author_list, 2):
                adj_mat[author2int[ia], author2int[ja]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations',
                leave=True,
                disable=not show_progress)

    # go through all publications and apply the coauthorship edge generator
    paa_df.groupby('PublicationId')['AuthorId'].progress_apply(
        coauthor_cluster)

    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
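isin_sorted is another package utility; since focus_author_ids is sorted before each call, it is presumably a searchsorted-based membership test, roughly:

import numpy as np

def isin_sorted(values, sorted_array):
    # assumed helper: boolean mask of which `values` appear in the pre-sorted array
    idx = np.searchsorted(sorted_array, values)
    idx[idx == sorted_array.shape[0]] = 0          # guard against out-of-range indices
    return sorted_array[idx] == values

authors = np.array([3, 1, 7, 5, 1])
focus = np.sort(np.array([1, 5]))
isin_sorted(authors, focus)   # -> array([False,  True, False,  True,  True])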
Example #17
def deflate(nominal_values,
            nominal_dates,
            real_date,
            index='ipca',
            progress_bar=False,
            on_jupyter=False):
    """
    deflatebr uses data from the Brazilian Institute for Applied Economic 
    Research's API (IPEADATA) to adjust nominal Brazilian Reais for inflation.

    Parameters
    ----------
    nominal_values : [int, float, np.array or pd.Series]
        An array containing nominal Brazilian Reais to deflate.
    nominal_dates : [str, date or list]
        A date vector with corresponding nominal dates (i.e., when nominal values were measured).
        Values are set to the previous month, following the
        standard methodology used by the Brazilian Central Bank
        https://www3.bcb.gov.br/CALCIDADAO/publico/metodologiaCorrigirIndice.do?method=metodologiaCorrigirIndice
    real_date : str
        A value indicating the reference date to deflate nominal values in the format
        'YYYY-MM' (e.g., '2018-01' for January 2018).
    index : str
        Indicates the price index used to deflate nominal Reais. 
        Valid options are: 'ipca', 'igpm', 'igpdi', 'ipc', and 'inpc'.
    progress_bar : bool
        True to display a progress bar. False by default.
    on_jupyter : bool
        True to display an HTML progress bar on jupyter notebook or jupyter lab.

    Returns
    -------
    np.ndarray : an array of deflated values.
    
    """
    # Prepare inputs
    nominal_values = np.array(nominal_values)
    real_date = clean_real_date(real_date)

    # If it is just one value, turn into a list
    if isinstance(nominal_dates, str):
        nominal_dates = [pd.to_datetime(nominal_dates)]
    elif isinstance(nominal_dates, date):
        nominal_dates = [nominal_dates]

    if len(nominal_dates) > 1:
        nominal_dates = pd.to_datetime(nominal_dates)

    # Round dates to first of each month and get one month earlier
    nominal_dates = [round_date_to_month(dt) for dt in nominal_dates]

    # Test index input
    if index not in ['ipca', 'igpm', 'igpdi', 'ipc', 'inpc']:
        raise Exception(
            "index must be one of 'ipca', 'igpm', 'igpdi', 'ipc', 'inpc'")

    # Request to IPEA API
    if index == 'ipca':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='PRECOS12_IPCA12')"
    elif index == 'igpm':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IGPM12')"
    elif index == 'igpdi':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IGPDI12')"
    elif index == 'ipc':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='IGP12_IPC12')"
    elif index == 'inpc':
        q = "http://ipeadata.gov.br/api/odata4/ValoresSerie(SERCODIGO='PRECOS12_INPC12')"

    res = requests.get(q)
    indice = pd.DataFrame.from_dict(json.load(StringIO(res.text))['value'])
    indice['VALDATA'] = pd.to_datetime(indice['VALDATA'],
                                       utc=True).dt.date.astype(str)

    # Calculate changes in values
    real_indx = indice.loc[indice.VALDATA == real_date, 'VALVALOR'].values
    df = pd.DataFrame({'nom_values': nominal_values, 'VALDATA': nominal_dates})

    df = df.merge(indice[['VALDATA', 'VALVALOR']], how='left', on='VALDATA')

    if progress_bar:
        if on_jupyter:
            from tqdm.notebook import tqdm
            tqdm.pandas()
            df['deflated'] = df[['nom_values',
                                 'VALVALOR']].progress_apply(lambda x: (
                                     (real_indx / x[1]) * x[0])[0],
                                                             axis=1)
        else:
            from tqdm import tqdm
            tqdm.pandas()
            df['deflated'] = df[['nom_values',
                                 'VALVALOR']].progress_apply(lambda x: (
                                     (real_indx / x[1]) * x[0])[0],
                                                             axis=1)
    else:
        df['deflated'] = df[['nom_values', 'VALVALOR']].apply(lambda x: (
            (real_indx / x[1]) * x[0])[0],
                                                              axis=1)

    return df.deflated.values
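A usage sketch for deflate (the call queries the live IPEADATA API, so the exact numbers depend on the published series):

# adjust R$ 100.00 measured in January 2010 and January 2015 to January 2018 prices
deflate(nominal_values=[100.0, 100.0],
        nominal_dates=['2010-01-15', '2015-01-15'],
        real_date='2018-01',
        index='ipca',
        progress_bar=True)
# -> numpy array with the two inflation-adjusted values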
Example #18
import nltk
nltk.download("stopwords")

import pandas as pd
import numpy as np
import spacy
import statistics
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup

from tqdm.notebook import trange, tqdm
tqdm.pandas(desc="Progress")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`

import numpy as np
import matplotlib.pyplot as plt

import pickle

import warnings
warnings.filterwarnings("ignore")

#Open Data in Pandas dataframe
path = '/content/drive/My Drive/Case Studies/Quora Question Pairs/train.csv'
#Loading data into pandas dataframe
Example #19
def align_publications(df1, df2=None, columns2match_exact=['Year'], column2match_approx='Title', ntop=1, cosine_lower_bound=0.75,
    use_threads=False, n_jobs=2, lev_lower_bound=0.9, show_progress=False):
    """
    Fast way to match publications between two datasets.  We first match subsets of exact values
    between the two DataFrames, as specified by `columns2match_exact`.
    We then use a fast approximate string matching to match values in `columns2match_approx` to within a threshold.

    Parameters
    ----------
    :param df1 : DataFrame
        A DataFrame with the publication information.

    :param df2 : DataFrame, Optional
        Another DataFrame with the publication information.  If None, then df1 is used again.

    :param columns2match_exact : list, Default: ['Year']
        The columns to match exactly between DataFrames.

    :param column2match_approx : list, Default: 'Title'
        The column to match approximately between DataFrames.

    :param ntop : int, Default 1
        The number of best matches from df2 to return for each row of df1.

    :param cosine_lower_bound : float, Default 0.75
        The lower bound on cosine similarity when doing the fuzzy string match.

    :param lev_lower_bound : float, Default 0.9
        The lower bound on the Levenshtein similarity score required to accept a match.

    :param use_threads : bool, Default False
        Use multithreading when calculating cosine similarity for fuzzy string matching.

    :param n_jobs : int, Optional, Default 2
        If use_threads is True, the number of threads to use in the parallel calculation.

    :param show_progress : bool, Default False
        If True, show a progress bar tracking the calculation.

    """
    # we can do an exact match from merge
    if (columns2match_exact is not None and len(columns2match_exact) > 0) and (column2match_approx is None or len(column2match_approx) == 0):
        # get the index name and reset the index to force it as a column
        indexcol = df2.index.name
        df2 = df2.reset_index(drop=False)
        # now merge the dataframes and drop duplicates giving an exact match
        mdf = df1.merge(df2[columns2match_exact + [indexcol]], how='left', on=columns2match_exact)
        mdf.drop_duplicates(subset=columns2match_exact, keep='first', inplace=True)
        return mdf[indexcol]

    # otherwise, if there is a column to match approximately then we need to prep for fuzzy matching
    elif len(column2match_approx) > 0:

        # we take a two-step approach to fuzzy matching
        # 1) first we employ a super fast but not very accurate cosine-similarity
        #    matching to narrow down the possible pair-wise matches
        #    for each string, we create feature vectors from 3-char counts
        tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
        tfidf1 = tfidf.fit_transform(df1[column2match_approx])
        tfidf2 = tfidf.transform(df2[column2match_approx])

        matches = np.empty(tfidf1.shape[0])
        matches[:] = np.NaN

        # if there are no columns to match exactly
        if (columns2match_exact is None or len(columns2match_exact) == 0):

            # 1) first do the all-to-all cosine similarity and extract up to the ntop best possible matches
            co= awesome_cossim_topn(tfidf1, tfidf2.T, ntop=ntop, lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).tocoo()

            # 2) now use the Levenshtein
            for row in tqdm(set(co.row), desc="Align Publications", disable=not show_progress):
                rowcol = co.col[co.row==row]
                argmatch, lev_dist = levenshtein_best_match(df1.loc[row, column2match_approx], df2.iloc[rowcol][column2match_approx])
                if lev_dist >= lev_lower_bound:
                    matches[row] = rowcol[argmatch]


        else:

            df2groups = df2.groupby(columns2match_exact)

            def subgroup_match(subdf):
                if not df2groups.indices.get(subdf.name, None) is None:
                    sub_tfidf1 = tfidf1[subdf.index.values]
                    sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]
                    co = awesome_cossim_topn(sub_tfidf1, sub_tfidf2.transpose(), ntop=ntop, lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).tocoo()

                    # 2) now use the Levenshtein distance to find the best match
                    for row in set(co.row):
                        rowcol = co.col[co.row==row]
                        argmatch, lev_dist = levenshtein_best_match(subdf.iloc[row][column2match_approx],
                            df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
                        if lev_dist >= lev_lower_bound:
                            matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]

            # register our pandas apply with tqdm for a progress bar
            tqdm.pandas(desc='Publication Matches', disable= not show_progress)

            df1.groupby(columns2match_exact, group_keys=True).progress_apply(subgroup_match)

        return matches
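The cosine pre-filtering in step 1 can be reproduced with plain scikit-learn if awesome_cossim_topn is unavailable; a sketch matching each title in df1 to its single best candidate in df2 (ntop=1) under the same char 3-gram TF-IDF features:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

titles1 = ['Deep learning for cats', 'Graph theory basics']
titles2 = ['Graph theory: the basics', 'Deep learning for cats and dogs', 'An unrelated title']

tfidf = TfidfVectorizer(min_df=1, ngram_range=(3, 3), analyzer='char', lowercase=False)
m1 = tfidf.fit_transform(titles1)
m2 = tfidf.transform(titles2)

# rows are L2-normalized by TfidfVectorizer, so the dot product is the cosine similarity
cos = (m1 @ m2.T).toarray()
best = cos.argmax(axis=1)
keep = cos.max(axis=1) >= 0.75           # plays the role of cosine_lower_bound
matches = np.where(keep, best, -1)       # -1 marks rows with no candidate above the threshold
# a Levenshtein check (step 2) would then confirm or reject each surviving candidate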
Example #20
def augment_annotation(bam, ranges):

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        def extract_annot(row):
            # bam_data['reference_start'] >= 155179779
            # start_data = bam_data[bam_data['reference_start'] >= 155179779]

            # TODO: There is something FUBAR in the start_data calculation

            bam_data = bam.get_sam_annotation(row.Chromosome, row.Start,
                                              row.End)

            if bam_data is None:
                return 0, 0, 0, 0, \
                   0, 0, 0, 0, 0, 0, 0, 0, \
                   0, 0

            start_data = bam_data.loc[(
                (bam_data.reference_start + bam_data.reference_length <=
                 row.End) & (bam_data.strand == "+") |
                ((bam_data.strand == "-") &
                 (bam_data.reference_start >= row.Start)))]
            #start_data = bam_data[bam_data['reference_start'] >= row.Start]

            # rstart - the number of reads that start within the given interval
            rstart = len(start_data)
            # basesstart - the number of bases contained within rstart
            bases_start = start_data.reference_length.sum()
            # meanreadlen - mean read length for any reads within this interval
            mean_read_len = bam_data.reference_length.mean()
            # startreadlen - mean read length for reads that start within interval
            start_read_len = start_data.reference_length.mean()
            # strandp
            strand_p = (bam_data.strand == '+').sum()
            # strandn
            strand_n = (bam_data.strand == '-').sum()
            # mapq - mapq for reads starting in segment
            mapq = (-10 * log10(
                (10**(start_data.mapping_quality / -10)).mean()))
            # map0 - mapq for reads overlapping the segment
            map0 = (-10 * log10((10**(bam_data.mapping_quality / -10)).mean()))
            # readq - per read q score for reads starting in segment
            readq = (-10 * log10(
                (10**(start_data.mapped_read_q / -10)).mean()))
            # read0 - per read q score for reads overlapping segment
            read0 = (-10 * log10((10**(bam_data.mapped_read_q / -10)).mean()))
            # nm - this is the #NM mismatch count; reads starting in segment
            nm = start_data.nm.sum()
            # cigar_del
            cigar_d = start_data.cigar_d.sum()
            # cigar_ins
            cigar_i = start_data.cigar_i.sum()
            # cigar_mapped
            cigar_m = start_data.cigar_m.sum()
            ##### and some local sequence context annotations

            # gccount

            # ncount

            return rstart, bases_start, mean_read_len, start_read_len, \
                   strand_p, strand_n, mapq, map0, readq, read0, nm, cigar_m, \
                   cigar_i, cigar_d

        tqdm.pandas()
        df_data = ranges.df

        df_data[[
            'rstart', 'bases_start', 'mean_read_len', 'start_read_len',
            'strand_p', 'strand_n', 'mapq', 'map0', 'readq', 'read0', 'nm',
            'cigar_m', 'cigar_i', 'cigar_d'
        ]] = df_data.progress_apply(extract_annot,
                                    axis=1,
                                    result_type='expand')
        return pr.PyRanges(df_data)
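The mapq/readq aggregation above works on the Phred scale: qualities are converted back to error probabilities, averaged, and re-encoded. In isolation:

import numpy as np

def mean_phred(qualities):
    # average Phred scores in probability space, then convert back to the Phred scale
    probs = 10 ** (np.asarray(qualities, dtype=float) / -10)
    return -10 * np.log10(probs.mean())

mean_phred([20, 20])   # -> 20.0
mean_phred([10, 30])   # -> ~13.0, dominated by the lower-quality read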
Example #21
def create_person_offer(transcript, portfolio, profile, person_transaction):
    """ A function to generete a new df with person and offer per row,

    Arguments:
        transcript -- Dataframe that contains all events
        portfolio -- Dataframe that contains datails of offers 
        profile -- Dataframe that contains details about customers

    Returns:
        person_offer_df -- new DataFrame
    """
    tqdm.pandas()
    to_be_appended = None

    # This will not include transaction, so we need another new table for those.
    for (person_id, offer_index), transcript_grouped in tqdm(
            transcript.dropna(subset=['offer_index']).groupby(
                ['person', 'offer_index'])):
        this_offer = portfolio.loc[offer_index]
        this_person = profile.loc[person_id]
        to_be_appended = append_one_person_offer(to_be_appended, this_offer,
                                                 person_id, offer_index,
                                                 transcript_grouped,
                                                 this_person)

    person_offer_df = pd.DataFrame(to_be_appended)

    # TODO: the code above and the code below were originally written at completely different times,
    # in different files and functions, and are now merged into one. A lot could probably be combined
    # instead of looping multiple times over the same data, but it works, so speeding it up is not
    # really needed yet.
    person_offer_df['before_start'] = 0
    person_offer_df['same_day_start'] = 0
    person_offer_df['after_start'] = 0
    person_offer_df['before_view'] = 0
    person_offer_df['same_day_view'] = 0
    person_offer_df['after_view'] = 0
    person_offer_df['before_complete'] = 0
    person_offer_df['same_day_complete'] = 0
    person_offer_df['after_complete'] = 0
    person_offer_df['w_before'] = 0
    person_offer_df['sum_during'] = 0
    person_offer_df['mean_during'] = 0
    person_offer_df['w_after'] = 0
    person_offer_df = person_offer_df.progress_apply(
        get_before_after_mean, person_transaction=person_transaction, axis=1)

    person_offer_df['viewed_reltime'] = np.nan
    person_offer_df['completed_reltime'] = np.nan

    def absulute2relative_time(x):
        """Converts absolute time (hours since start of simulation)
        to hours since the offer was received (start)
        """
        if x.viewed:
            x.viewed_reltime = x.viewed_time - x.start

        if x.completed:
            x.completed_reltime = x.completed_time - x.start

        return x

    person_offer_df = person_offer_df.progress_apply(absulute2relative_time,
                                                     axis=1)

    #makes it easier to access these combinations
    person_offer_df['complete_viewed'] = (
        person_offer_df['completed'] & person_offer_df['viewed']).astype(int)
    person_offer_df['complete_not_viewed'] = (
        person_offer_df['completed'] & ~person_offer_df['viewed']).astype(int)
    person_offer_df['not_complete_not_viewed'] = (
        ~person_offer_df['completed'] & ~person_offer_df['viewed']).astype(int)
    person_offer_df['not_complete_viewed'] = (
        ~person_offer_df['completed'] & person_offer_df['viewed']).astype(int)
    person_offer_df['completed'] = person_offer_df['completed'].astype(int)
    person_offer_df['viewed'] = person_offer_df['viewed'].astype(int)

    #calculates diff in sales before and after an event
    for x in ['start', 'view', 'complete']:
        person_offer_df[f'diff_{x}'] = person_offer_df[
            f'after_{x}'] - person_offer_df[f'before_{x}']

    person_offer_df['diff_offer'] = person_offer_df[
        'w_after'] - person_offer_df['w_before']

    #recalculates became_member_on to member_since instead (where newest member is 0) in days
    person_offer_df['became_member_on'] = pd.to_datetime(
        person_offer_df['became_member_on'], format='%Y-%m-%d')
    person_offer_df['member_since_days'] = (
        person_offer_df['became_member_on'].max() -
        person_offer_df['became_member_on']).dt.days

    #remove these wrong ages and turn them to NaN
    person_offer_df['age'] = person_offer_df['age'].apply(lambda x: np.nan
                                                          if x == 118 else x)

    return person_offer_df
Example #22
    def run(
        self,
        value: Optional[str] = None,
        data: Optional[pd.DataFrame] = None,
        timespan: Optional[TimeSpan] = None,
        options: Optional[Iterable[str]] = None,
        **kwargs,
    ) -> TIEnrichResult:
        """
        Return an enriched set of Alerts.

        Parameters
        ----------
        timespan : TimeSpan
            Timespan for queries
        options : Optional[Iterable[str]], optional
            List of options to use, by default None.
            A value of None means use default options.
            Options prefixed with "+" will be added to the default options.
            To see the list of available options type `help(cls)` where
            "cls" is the notebooklet class or an instance of this class.
        value: Optional[str], optional
            If you want to filter Alerts based on a specific entity specify
            it as a string.
        data: Optional[pd.DataFrame], optional
            If you have alerts in a DataFrame you can pass them rather than
            having the notebooklet query alerts.

        Returns
        -------
        TIEnrichResult
            Result object with attributes for each result type.

        Raises
        ------
        MsticnbMissingParameterError
            If required parameters are missing

        MsticnbDataProviderError
            If data is not available

        """
        # This line use logic in the superclass to populate options
        # (including default options) into this class.
        super().run(value=value,
                    data=data,
                    timespan=timespan,
                    options=options,
                    **kwargs)

        if not timespan and data is None:
            raise MsticnbMissingParameterError("timespan.")

        # If data is not provided, query Sentinel to get it.
        if data is None:
            nb_print("Collecting alerts")
            if value is not None:
                data = _get_all_alerts(self.query_provider, timespan, value)
            else:
                data = _get_all_alerts(self.query_provider, timespan)

        # Create a result class
        # Add description to results for context
        self._last_result = TIEnrichResult(description=f"""Enriched alerts,
                            with the filter of {value}""")

        # Establish TI providers
        if "tilookup" in self.data_providers.providers:
            ti_prov = self.data_providers.providers["tilookup"]
        else:
            raise MsticnbDataProviderError("No TI providers avaliable")

        if isinstance(data, pd.DataFrame) and not data.empty:
            data["Entities"] = data["Entities"].apply(_entity_load)
            tqdm.pandas(desc="TI lookup progress")
            ti_sec = False
            if "secondary" in self.options:
                ti_sec = True
            md("""Alerts enriched with threat intelligence -
                 TI Risk is the the hightest score provided by any of
                 the configured providers.""")
            data["TI Risk"] = data.progress_apply(
                lambda row: _lookup(row, ti_prov, secondary=ti_sec), axis=1)
            if not self.silent:
                display(data[[
                    "StartTimeUtc",
                    "AlertName",
                    "Severity",
                    "TI Risk",
                    "Description",
                ]].sort_values(by=["StartTimeUtc"]).style.applymap(
                    _color_cells).hide_index())
            if "details" in self.options:
                self._last_result.picker = _alert_picker(data,
                                                         ti_prov,
                                                         secondary=ti_sec,
                                                         silent=self.silent)
        else:
            raise MsticnbDataProviderError("No alerts avaliable")

        self._last_result.enriched_results = data

        return self._last_result