Example #1
import warnings

import numpy as np
from pandas import DataFrame
import pandas.util.testing as tm


# asv-style benchmark: scalar and boolean indexing into a string-indexed DataFrame
class DataFrameStringIndexing(object):

    goal_time = 0.2

    def setup(self):
        index = tm.makeStringIndex(1000)
        columns = tm.makeStringIndex(30)
        self.df = DataFrame(np.random.randn(1000, 30), index=index,
                            columns=columns)
        self.idx_scalar = index[100]
        self.col_scalar = columns[10]
        self.bool_indexer = self.df[self.col_scalar] > 0
        self.bool_obj_indexer = self.bool_indexer.astype(object)

    def time_get_value(self):
        with warnings.catch_warnings(record=True):
            self.df.get_value(self.idx_scalar, self.col_scalar)

    def time_ix(self):
        self.df.ix[self.idx_scalar, self.col_scalar]

    def time_loc(self):
        self.df.loc[self.idx_scalar, self.col_scalar]

    def time_getitem_scalar(self):
        self.df[self.col_scalar][self.idx_scalar]

    def time_boolean_rows(self):
        self.df[self.bool_indexer]

    def time_boolean_rows_object(self):
        self.df[self.bool_obj_indexer]
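get_value was deprecated in pandas 0.21 and removed in 1.0, as was .ix; .at and .iat are the supported fast scalar accessors. A minimal sketch of the equivalent lookups on current pandas, reusing the df, idx_scalar and col_scalar built in setup above:

df.at[idx_scalar, col_scalar]    # label-based scalar access (replaces get_value)
df.iat[100, 10]                  # position-based scalar access
df.loc[idx_scalar, col_scalar]   # general label-based indexing (replaces .ix)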
Example #2
    def compute_tf_idf_queries(self):
        # Total number of documents (last value of the sqlite autoincrement sequence)
        results = self.cursor.execute("SELECT seq FROM sqlite_sequence WHERE name='documents'")
        tmp = results.fetchone()
        total_doc = tmp[0]

        results = self.cursor.execute('SELECT did, total_word, path FROM documents')
        tmp = results.fetchall()
        documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
        documents_df['tf_idf'] = 0.0

        no_docterm = {}

        for query in self.queries:
            no_docterm[query] = 0

        for index, row in documents_df.iterrows():
            path = row['path']
            with codecs.open(path, 'rt', encoding='utf-8') as f:
                text = f.read()
                for query in self.queries:
                    if query in text.lower():
                        no_docterm[query] += 1

        for query in self.queries:
            for index, row in documents_df.iterrows():
                total_word = row['total_word']
                path = row['path']

                with codecs.open(path, 'rt', encoding='utf-8') as f:
                    text = f.read()

                tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc, no_docterm[query])
                cur_tf_idf = documents_df.get_value(index, 'tf_idf')
                documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

        results = self.cursor.execute('SELECT did, type, entity FROM entities')
        tmp = results.fetchall()
        df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
        df['tf_idf'] = 0.0

        for index, row in df.iterrows():
            did = row['did']
            tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
            df.set_value(index, 'tf_idf', tf_idf)

        del df['did']
        df = df.groupby(['e_type', 'entity']).sum().reset_index()
        return df
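The `_compute_tf_idf_queries` helper is not shown above. A plausible body, assuming `import math` and the standard tf-idf definition (term count normalized by document length, weighted by log inverse document frequency); the signature mirrors the call site, but the implementation is a guess:

    def _compute_tf_idf_queries(self, text, total_word, total_doc, no_docterm):
        # Hypothetical body: term frequency over all queries (no single query
        # is passed in), weighted by log inverse document frequency
        count = sum(text.lower().count(q) for q in self.queries)
        tf = count / float(total_word) if total_word else 0.0
        idf = math.log(total_doc / float(no_docterm)) if no_docterm else 0.0
        return tf * idf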
Example #3
    def set_targets_from_file(self, filename = None, year = None):
        '''
        Loads targets from file and display them in the frame
        '''

        if year is None:
            year = str(CONF.get('simulation', 'datesim').year)

        if filename is None:
            fname = "actualisation_groups.h5"
            data_dir = CONF.get('paths', 'data_dir')
            filename = os.path.join(data_dir, fname)

        store = HDFStore(filename)

        # Builds openfisca variables from irpp declaration variables
        df_c = store["corresp"]
        of_vars = dict()
        for col in df_c.columns:
            of_vars[col] = list(df_c[col].dropna().unique())

        df_a = store['amounts']
        df_b = store['benef']
        store.close()

        df_a1 = DataFrame({'amount': df_a[year]})

        df_a = DataFrame(columns=['amount'])

        for of_var, declar_vars_list in of_vars.items():
            amount = 0
            for case in declar_vars_list:
                a = df_a1.get_value(case, 'amount')
                if a == a:  # NaN != NaN, so this skips missing amounts
                    amount += a
            df_a1.drop(declar_vars_list, axis=0, inplace=True)
            row = DataFrame(dict(amount=[amount]), index=[of_var])
            df_a = df_a.append(row)

        df_a = df_a.append(df_a1)

        self.vars_df = df_a
        self.vars_df.index.names = ['var']
        self.fill_vars()
        self.fill_coeffs()
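DataFrame.append, used above, was deprecated in pandas 1.4 and removed in 2.0. A minimal sketch of the same row-accumulation pattern on current pandas, with hypothetical stand-in data:

import pandas as pd

rows = [pd.DataFrame({'amount': [amount]}, index=[of_var])
        for of_var, amount in [('var_a', 10.0), ('var_b', 2.5)]]
df_a = pd.concat(rows)  # one concat replaces repeated DataFrame.append calls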
Example #4
class Scores(AnnotationMixin, object):
    """

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`

        """
        A = cls(uri=uri, modality=modality)
        A._df = pivot_table(
            df, values=PYANNOTE_SCORE,
            # rows/cols is the pre-0.14 pivot_table signature (later renamed index/columns)
            rows=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], cols=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )
        return A

    def __init__(self, uri=None, modality=None):
        super(Scores, self).__init__()

        index = MultiIndex(
            levels=[[], []], labels=[[], []],  # labels= was renamed codes= in pandas 0.24
            names=[PYANNOTE_SEGMENT, PYANNOTE_TRACK]
        )

        self._df = DataFrame(index=index, dtype=np.float64)
        self.modality = modality
        self.uri = uri
        self._timelineHasChanged = True

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):

        if isinstance(key, Segment):
            segment = key
            self._df = self._df.drop(segment, axis=0)
            self._timelineHasChanged = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self._df = self._df.drop((segment, track), axis=0)
            self._timelineHasChanged = True

        else:
            raise KeyError(key)

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        segment, track, label = key
        return self._df.get_value((segment, track), label)

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return {l: self._df.get_value((segment, track), l) for l in self._df}

    # scores[segment, track, label] = value
    def __setitem__(self, key, value):
        segment, track, label = key
        self._df = self._df.set_value((segment, track), label, value)
        self._timelineHasChanged = True

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self._df.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def itervalues(self):
        """Iterate over annotation as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._df = self._df.sort_index()

        # yield one (segment, track, label) tuple per loop
        labels = self._df.columns
        for (segment, track), columns in self._df.iterrows():
            for label in labels:
                value = columns[label]
                if np.isnan(value):
                    continue
                else:
                    yield segment, track, label, value

    def _rank(self, invert):

        if invert:
            direction = 1.

        else:
            direction = -1.

        def nan_rank(data):

            # replace NaN by -inf or +inf depending on the requested direction
            finite = np.isfinite(data)
            fixed = np.where(finite, direction*data, -direction*np.inf)

            # do the actual argsort
            indices = np.argsort(fixed)
            # get rank from argsort
            rank = np.argsort(indices)

            # special treatment for inverted NaN scores
            # (we want ranks to start at 0 even in case of NaN)
            if invert:
                rank = np.where(finite, rank-(len(data)-np.sum(finite)), np.nan)
            else:
                rank = np.where(finite, rank, np.nan)
            return rank

        return self._df.apply(nan_rank, axis=1)

    def rank(self, invert=False):
        """

        Parameters
        ----------
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        rank : `Scores`

        """
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._rank(invert)
        return A

    def nbest(self, n, invert=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        invert : bool, optional
            By default, larger scores are better.
            Set `invert` to True to indicate smaller scores are better.

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        df = self._df.copy()
        nbest = self._rank(invert) < n
        df[~nbest] = np.nan

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = df

        return A

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = self._df[list(labels)]

        return A

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.
        """

        annotation = Annotation(uri=self.uri, modality=self.modality)
        if not self:
            return annotation

        best = self.nbest(1, invert=False)

        if posterior:

            # compute unknown posterior
            func = lambda p: 1. - np.nansum(p, axis=1)
            Pu = self.apply(func, new_columns=['_'])

            # threshold best target posterior
            # with unknown posterior and threshold
            for segment, track, label, value in best.itervalues():

                if value < Pu[segment, track, '_'] or value < threshold:
                    label = Unknown()

                annotation[segment, track] = label

        else:

            # threshold best target score with threshold
            for segment, track, label, value in best.itervalues():
                if value < threshold:
                    label = Unknown()
                annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        A = self.__class__(uri=self.uri, modality=self.modality)
        A._df = func(self._df)
        return A

    def apply(self, data_func, new_index=None, new_columns=None):
        """Apply `data_func` on internal numpy array

        Parameters
        ----------
        data_func : func
            Function expecting (index x columns) numpy array as input
        new_index : iterable, optional
            When provided, these will be the index of returned array.
        new_columns : iterable, optional
            When provided, these will be the columns of returned array.
        """
        new_data = data_func(self._df.values)

        if new_index is None:
            new_index = self._df.index

        if new_columns is None:
            new_columns = self._df.columns

        df = DataFrame(
            data=new_data,
            index=new_index,
            columns=new_columns)

        new_scores = self.__class__(uri=self.uri, modality=self.modality)
        new_scores._df = df

        return new_scores

    def _repr_png_(self):
        from pyannote.core.notebook import repr_scores
        return repr_scores(self)
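A short usage sketch for the ranking helpers above, continuing the docstring example (Scores and Segment as defined/imported in this module):

s = Scores(uri='video', modality='speaker')
s[Segment(0, 1), 's1', 'A'] = 0.1
s[Segment(0, 1), 's1', 'B'] = 0.2

ranks = s.rank()                 # label 'B' outranks 'A' (larger scores are better)
best = s.nbest(1)                # keep only the best label per (segment, track)
annotation = s.to_annotation()   # best label per track, as an Annotation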
Example #5
#%%
d['a']        # Series (one column)
#%%
d[['a','c']]  # DataFrame (several columns)
#%%
d[:5]         # DataFrame (rows)
#%%
d.ix[:5]      # position-based (rows)
#%%
d1.ix[:5]     # label-based (rows)
#%%
d.irow(0)     # Series
#%%
d.icol(0)     # Series
#%%
d.get_value('e','a')    # get_value(row_name, col_name)
#%% Methods that force access by position
d.iget_value(0,1)       # iget_value(irow, icol)

#%% Filtering with boolean conditions
d[d>5]
#%%
d[d.a>5]
#%%
d[(d>5)&(d%3==0)]

#%% What conditional filtering actually evaluates
d>5       # DataFrame
#%%
d.a>5     # Series
#%% You can also build such a boolean Series yourself
Example #6
class Aggregates(object):
    filter_by = None
    labels = collections.OrderedDict((
        ('var', u"Mesure"),
        ('entity', u"Entité"),
        ('dep', u"Dépenses\n(millions d'€)"),
        ('benef', u"Bénéficiaires\n(milliers)"),
        ('dep_default', u"Dépenses initiales\n(millions d'€)"),
        ('benef_default', u"Bénéficiaires\ninitiaux\n(milliers)"),
        ('dep_real', u"Dépenses\nréelles\n(millions d'€)"),
        ('benef_real', u"Bénéficiaires\nréels\n(milliers)"),
        ('dep_diff_abs', u"Diff. absolue\nDépenses\n(millions d'€)"),
        ('benef_diff_abs', u"Diff absolue\nBénéficiaires\n(milliers)"),
        ('dep_diff_rel', u"Diff. relative\nDépenses"),
        ('benef_diff_rel', u"Diff. relative\nBénéficiaires"),
        ))  # TODO: localize
    show_default = False
    show_diff = True
    show_real = True
    survey_scenario = None
    totals_df = None
    varlist = None

    def __init__(self, survey_scenario = None):
        if survey_scenario is not None:
            self.set_survey_scenario(survey_scenario)

    def clear(self):
        self.totals_df = None

    def compute(self):
        """
        Compute the whole table
        """
        self.compute_aggregates(self.filter_by)
        self.load_amounts_from_file()
        self.compute_real()
        self.compute_diff()

    def compute_aggregates(self, filter_by = None):
        """
        Compute aggregate amounts
        """
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        V = []
        M = {'data': [], 'default': []}
        B = {'data': [], 'default': []}
        U = []

        M_label = {'data': self.labels['dep'],
                   'default': self.labels['dep_default']}
        B_label = {'data': self.labels['benef'],
                   'default': self.labels['benef_default']}

        for var in self.varlist:
            # amounts and beneficiaries from current data and default data if exists
            montant_benef = self.get_aggregate(var, filter_by)
            V.append(column_by_name[var].label)
            entity = column_by_name[var].entity_key_plural

            U.append(entity)
            for dataname in montant_benef:
                M[dataname].append(montant_benef[dataname][0])
                B[dataname].append(montant_benef[dataname][1])

        # build items list
        items = [(self.labels['var'], V)]

        for dataname in M:
            if M[dataname]:
                items.append((M_label[dataname], M[dataname]))
                items.append((B_label[dataname], B[dataname]))

        items.append((self.labels['entity'], U))
        aggr_frame = DataFrame.from_items(items)

        self.aggr_frame = None
        for code, label in self.labels.items():
            try:
                col = aggr_frame[label]
                if self.aggr_frame is None:
                    self.aggr_frame = DataFrame(col)
                else:
                    self.aggr_frame = self.aggr_frame.join(col, how="outer")
            except KeyError:
                pass

    def compute_diff(self):
        '''
        Computes and adds relative differences
        '''

        dep = self.aggr_frame[self.labels['dep']]
        benef = self.aggr_frame[self.labels['benef']]

        if self.show_default:
            ref_dep_label, ref_benef_label = self.labels['dep_default'], self.labels['benef_default']
            if ref_dep_label not in self.aggr_frame:
                return
        elif self.show_real:
            ref_dep_label, ref_benef_label = self.labels['dep_real'], self.labels['benef_real']
        else:
            return

        ref_dep = self.aggr_frame[ref_dep_label]
        ref_benef = self.aggr_frame[ref_benef_label]

        self.aggr_frame[self.labels['dep_diff_rel']] = (dep - ref_dep) / abs(ref_dep)
        self.aggr_frame[self.labels['benef_diff_rel']] = (benef - ref_benef) / abs(ref_benef)
        self.aggr_frame[self.labels['dep_diff_abs']] = dep - ref_dep
        self.aggr_frame[self.labels['benef_diff_abs']] = benef - ref_benef

    def compute_real(self):
        '''
        Adds administrative data to dataframe
        '''
        if self.totals_df is None:
            return
        A, B = [], []
        for var in self.varlist:
            # totals from administrative data
            if var in self.totals_df.index:
                A.append(self.totals_df.get_value(var, "amount"))
                B.append(self.totals_df.get_value(var, "benef"))
            else:
                A.append(nan)
                B.append(nan)
        self.aggr_frame[self.labels['dep_real']] = A
        self.aggr_frame[self.labels['benef_real']] = B

    def create_description(self):
        '''
        Creates a description dataframe
        '''
        now = datetime.now()
        return DataFrame([
            u'OpenFisca',
            u'Calculé le %s à %s' % (now.strftime('%d-%m-%Y'), now.strftime('%H:%M')),
            u'Système socio-fiscal au %s' % self.simulation.period.start,
            u"Données d'enquêtes de l'année %s" % str(self.simulation.input_table.survey_year),
            ])

    def get_aggregate(self, variable, filter_by = None):
        """
        Returns aggregate spending, and number of beneficiaries
        for the relevant entity level

        Parameters
        ----------
        variable : string
                   name of the variable aggregated according to its entity
        """
        simulation = self.simulation
        column_by_name = self.simulation.tax_benefit_system.column_by_name
        column = column_by_name[variable]
        weight_name = self.weight_column_name_by_entity_key_plural[column.entity_key_plural]
        filter_by_name = "{}_{}".format(filter_by, column.entity_key_plural)
        # amounts and beneficiaries from current data and default data if exists
        # Build weights for each entity
        data = DataFrame(
            {
                variable: simulation.calculate_add(variable),
                weight_name: simulation.calculate(weight_name),
                }
            )
        data_default = None

        datasets = {'data': data}
        if data_default is not None:
            datasets['default'] = data_default
        filter_indicator = True
        if filter_by:
            filtered_data = DataFrame(
                {
                    variable: simulation.calculate(variable),
                    weight_name: simulation.calculate(weight_name),
                    filter_by_name: simulation.calculate(filter_by_name),
                    }
                )
            data_default = None
            filter_indicator = filtered_data[filter_by_name]
        m_b = {}

        weight = data[weight_name] * filter_indicator
        for name, data in datasets.items():
            amount = data[variable]
            benef = data[variable].values != 0
            try:
                total_amount = int(round(sum(amount * weight) / 10 ** 6))
            except ValueError:  # sum is NaN when data are missing
                total_amount = nan
            try:
                total_benef = int(round(sum(benef * weight) / 10 ** 3))
            except ValueError:
                total_benef = nan

            m_b[name] = [total_amount, total_benef]

        return m_b

    def load_amounts_from_file(self, filename = None, year = None):
        '''
        Loads totals from files
        '''
        if year is None:
            year = self.year
        if filename is None:
            filename = os.path.join(DATA_DIR, "amounts.h5")

        try:
            store = HDFStore(filename)

            df_a = store['amounts']
            df_b = store['benef']
            store.close()
            self.totals_df = DataFrame(data = {
                "amount": df_a[year] / 10 ** 6,
                "benef": df_b[year] / 1000,
                })
            row = DataFrame({'amount': nan, 'benef': nan}, index = ['logt'])
            self.totals_df = self.totals_df.append(row)

            # Add some additional totals
            for col in ['amount', 'benef']:
                # Deals with logt
                logt = 0
                for var in ['apl', 'alf', 'als']:
                    logt += self.totals_df.get_value(var, col)
                self.totals_df.set_value('logt', col, logt)

                # Deals with rsa rmi
                rsa = 0
                for var in ['rmi', 'rsa']:
                    rsa += self.totals_df.get_value(var, col)
                self.totals_df.set_value('rsa', col, rsa)

                # Deals with irpp, csg, crds
                for var in ['irpp', 'csg', 'crds', 'cotsoc_noncontrib']:
                    if col == 'amount':
                        val = -self.totals_df.get_value(var, col)
                        self.totals_df.set_value(var, col, val)
        except Exception:
            #  raise Exception(" No administrative data available for year " + str(year))
            import warnings
            warnings.warn("No administrative data available for year %s in file %s" % (str(year), filename))
            self.totals_df = None
            return

    def save_table(self, directory = None, filename = None, table_format = None):
        '''
        Saves the table to some format
        '''
        now = datetime.now()
        if table_format is None:
            if filename is not None:
                extension = filename[-4:]
                if extension == '.xls':
                    table_format = 'xls'
                elif extension == '.csv':
                    table_format = 'csv'
            else:
                table_format = 'xls'

        if directory is None:
            directory = "."
        if filename is None:
            filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

        fname = os.path.join(directory, filename)

        try:
            df = self.aggr_frame
            if table_format == "xls":
                writer = ExcelWriter(str(fname))
                df.to_excel(writer, "aggregates", index=False, header=True)
                descr = self.create_description()
                descr.to_excel(writer, "description", index=False, header=False)
                writer.save()
            elif table_format == "csv":
                df.to_csv(fname, index=False, header=True)
        except Exception as e:
            raise Exception("Aggregates: Error saving file", str(e))
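DataFrame.from_items, used in compute_aggregates above, was removed in pandas 1.0. A minimal sketch of the documented replacement, which preserves column order (items as hypothetical (label, column) pairs):

from collections import OrderedDict
import pandas as pd

items = [(u"Mesure", ['rsa', 'apl']), (u"Entité", ['men', 'men'])]
aggr_frame = pd.DataFrame.from_dict(OrderedDict(items))  # replaces DataFrame.from_items(items)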
Example #7
from pandas import DataFrame
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Dictionary and JobDescription are project-local (a text-cleaning helper and a Django model)


class BuildLda:
    def __init__(self, print_list=True):
        # Create dictionary
        self.dictionary = Dictionary()
        self.topics = ['Topic {}'.format(i) for i in range(1,31)]
        self.print_list = print_list

    def build_object(self):
        self.build_model()
        self.transform_set()
        self.build_nearest_neighbours()

    def build_model(self):
        if self.print_list:
            print('Building LDA')
        strings = JobDescription.objects.values('url', 'body')

        data_samples = []
        seen_strings = set()
        for string in strings:
            if string['body'] not in seen_strings:
                seen_strings.add(string['body'])
                data_samples.append({'url': string['url'], 'string': self.dictionary.clean_string(string['body'])})

        self.data_samples = DataFrame(data_samples)

        n_features = 10000
        n_topics = 15
        n_top_words = 10
        max_iter = 40

        self.tf_vectorizer = CountVectorizer(max_features=n_features,
                                        stop_words='english')

        tf = self.tf_vectorizer.fit_transform(self.data_samples['string'])

        self.lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,
                                        learning_method='online')

        self.lda.fit(tf)

        if self.print_list:
            print()
            print("\nTopics in LDA model:")
        tf_feature_names = self.tf_vectorizer.get_feature_names()
        self.create_word_topics(self.lda, tf_feature_names)
        if self.print_list:
            self.print_top_words(self.lda, tf_feature_names, n_top_words)

    def test_single_doc(self, string):
        data_samples = DataFrame([{'string': self.dictionary.clean_string(string)}])
        test = self.tf_vectorizer.transform(data_samples['string'])
        lda_result = self.lda.transform(test)
        top_tags = []
        return_value = {'lda_result': lda_result, 'tags': []}
        index_set = sorted(range(len(lda_result[0])), key=lambda i: lda_result[0][i], reverse=True)
        position = 0
        for index in index_set:
            return_value['tags'].append({'tag': self.topics[index], 'position': position, 'score': lda_result[0][index]})
            top_tags.append(self.topics[index])
            position += 1
        return return_value

    def transform_set(self):
        if self.print_list:
            print('Getting LDA Transformation')
        vectorizor_data = self.tf_vectorizer.transform(self.data_samples['string'])
        self.results = self.lda.transform(vectorizor_data)

    def build_nearest_neighbours(self):
        if self.print_list:
            print('Build Nearest Neighbours')
        self.nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(self.results)

    def get_neighbours(self, string, print_output=False):
        return_result = self.test_single_doc(string)
        return_result['distances'], return_result['indices'] = self.nbrs.kneighbors(return_result['lda_result'])

        if print_output:
            self.print_neighbours(return_result['indices'][0])
        return_result['neighbours'] = self.return_neighbours(return_result['indices'][0], return_result['distances'][0])

        return {'tags': return_result['tags'], 'neighbours': return_result['neighbours']}

    def print_neighbours(self, indices):
        print('Closest 10 jobs:')
        for indice in indices:
            url = self.data_samples.get_value(indice, 'url')
            print('http://www.seek.com.au%s' % url)

    def return_neighbours(self, indices, distances):
        return_value = []
        for index in range(len(indices)):
            url = self.data_samples.get_value(indices[index], 'url')
            return_value.append({'url': 'http://www.seek.com.au{}'.format(url), 'distance': distances[index]})
        return return_value

    def print_top_words(self, model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print(self.topics[topic_idx]+": "+" ".join([feature_names[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def create_word_topics(self, model, feature_names):
        for topic_idx, topic in enumerate(model.components_):
            self.topics[topic_idx] = "_".join([feature_names[i] for i in topic.argsort()[:-3 - 1:-1]])
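On current scikit-learn, two names used above have changed: LatentDirichletAllocation's n_topics became n_components (renamed in 0.19), and get_feature_names became get_feature_names_out (removed in 1.2). A minimal sketch of the same fit on a recent version, reusing tf and tf_vectorizer from build_model:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=15, max_iter=40, learning_method='online')
lda.fit(tf)
feature_names = tf_vectorizer.get_feature_names_out()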
Example #8
    def fit(self, annotations):
        """

        Parameters
        ----------
        annotations : (Annotation, Annotation) iterator

        Returns
        -------


        """

        # possible_match[n, m] is the total possible match duration
        # when there are n A-tracks & m B-tracks
        possible_match = DataFrame()

        # actual_match[n, m] is the total actual match duration
        # when there are n A-tracks & m B-tracks
        actual_match = DataFrame()

        # overlap[n, m] is the total duration
        # when there are n A-tracks & m B-tracks
        overlap = DataFrame()

        for n, (A, B) in enumerate(annotations):

            assert isinstance(A, Annotation), "%r is not an Annotation" % A
            assert isinstance(B, Annotation), "%r is not an Annotation" % B
            if n == 0:
                self.modalityA = A.modality
                self.modalityB = B.modality
            else:
                assert A.modality == self.modalityA, \
                    "bad modality (%r, %r)" % (self.modalityA, A.modality)
                assert B.modality == self.modalityB, \
                    "bad modality (%r, %r)" % (self.modalityB, B.modality)
            assert A.uri == B.uri, \
                "resource mismatch (%r, %r)" % (A.uri, B.uri)

            timeline, a, b = self._AB2ab(A, B)

            for segment in timeline:

                duration = segment.duration

                # number of tracks
                atracks = a.tracks(segment)
                Na = len(atracks)
                btracks = b.tracks(segment)
                Nb = len(btracks)

                if Na == 0 or Nb == 0:
                    continue

                # number of matching tracks
                N = len(a.get_labels(segment) & b.get_labels(segment))

                # increment possible_match & actual_match
                try:
                    p_m = possible_match.get_value(Na, Nb)
                    a_m = actual_match.get_value(Na, Nb)
                    ovl = overlap.get_value(Na, Nb)
                except KeyError:
                    p_m = 0.
                    a_m = 0.
                    ovl = 0.

                possible_match = possible_match.set_value(Na, Nb,
                                                          p_m + min(Na, Nb)*duration)
                actual_match = actual_match.set_value(Na, Nb,
                                                      a_m + N*duration)
                overlap = overlap.set_value(Na, Nb, ovl + duration)
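get_value and set_value were removed in pandas 1.0. A sketch of the same read-or-default and grow-on-write pattern with .loc, which enlarges the frame the way set_value did (same Na, Nb and duration as above):

try:
    p_m = possible_match.loc[Na, Nb]
except KeyError:
    p_m = 0.
possible_match.loc[Na, Nb] = p_m + min(Na, Nb) * duration  # setting with enlargement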