    def read_file_info(self, infname, n_paths, calc_adj_mi):
        paths = [None for _ in range(n_paths)]
        with opener('r')(infname) as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                if line['partition'] == '':
                    raise Exception('ERROR null partition (one of the processes probably got passed zero sequences)')  # shouldn't happen any more FLW
                uids = []
                for cluster in line['partition'].split(';'):
                    uids.append(cluster.split(':'))
                path_index = int(line['path_index'])
                if paths[path_index] is None:
                    paths[path_index] = ClusterPath(int(line['initial_path_index']))
                else:
                    assert paths[path_index].initial_path_index == int(line['initial_path_index'])
                n_procs = int(line['n_procs']) if 'n_procs' in line else 1
                logweight = float(line['logweight']) if 'logweight' in line else None
                adj_mi = -1
                if calc_adj_mi:
                    adj_mi = utils.mutual_information(uids, self.reco_info, debug=False) if self.reco_info is not None else -1
                paths[path_index].add_partition(uids, float(line['logprob']), n_procs=n_procs, logweight=logweight, adj_mi=adj_mi)

        for cp in paths:
            if cp is None:
                raise Exception('None type path read from %s' % infname)
            for ptn in cp.partitions:
                if len(ptn) == 0:
                    raise Exception('zero length partition read from %s' % infname)

        return paths
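
    # Hedged sketch (not from the original source) of the csv format read_file_info() expects:
    # one row per partition, clusters separated by ';' and sequence ids within a cluster joined by ':'.
    # Writing a single row for path 0 with two clusters (seq-a, seq-b) and (seq-c) might look like:
    #
    #     writer = csv.DictWriter(csvfile, fieldnames=('path_index', 'initial_path_index', 'logprob', 'partition'))
    #     writer.writeheader()
    #     writer.writerow({'path_index' : 0, 'initial_path_index' : 0, 'logprob' : -123.4, 'partition' : 'seq-a:seq-b;seq-c'})
    #
    # The optional 'n_procs' and 'logweight' columns are read if present.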
    def add_next_global_partition():  # nested helper: closes over fileinfos, ipath, calc_adj_mi, and self from the enclosing method
        global_partition = []
        global_logprob = 0.
        for ifile in range(len(fileinfos)):  # combine the first line in each file to make a global partition
            for cluster in fileinfos[ifile][ipath].partitions[0]:
                global_partition.append(list(cluster))
            global_logprob += fileinfos[ifile][ipath].logprobs[0]
        global_adj_mi = -1
        if calc_adj_mi:
            global_adj_mi = utils.mutual_information(global_partition, self.reco_info, debug=False) if self.reco_info is not None else -1
        self.paths[ipath].add_partition(global_partition, global_logprob, n_procs=len(fileinfos), logweight=0., adj_mi=global_adj_mi)  # don't know the logweight yet (or maybe at all!)
    def write_partitions(self, writer, is_data, reco_info, true_partition, smc_particles, path_index, n_to_write=None, calc_adj_mi=None):
        for ipart in self.get_partition_subset(n_partitions=n_to_write):
            part = self.partitions[ipart]
            cluster_str = ''
            bad_clusters = []  # inferred clusters that aren't really all from the same event
            for ic in range(len(part)):
                if ic > 0:
                    cluster_str += ';'
                cluster_str += ':'.join(part[ic])
                if not is_data:
                    same_event = utils.from_same_event(is_data, reco_info, part[ic])  # are all the sequences from the same event?
                    entire_cluster = True  # ... and if so, are they the entire true cluster?
                    if same_event:
                        reco_id = reco_info[part[ic][0]]['reco_id']  # they've all got the same reco_id, so pick an arbitrary one
                        true_cluster = true_partition[reco_id]
                        for uid in true_cluster:
                            if uid not in part[ic]:
                                entire_cluster = False
                                break
                    else:
                        entire_cluster = False
                    if not same_event or not entire_cluster:
                        bad_clusters.append(':'.join(part[ic]))

            if len(bad_clusters) > 25:  # don't write an unwieldy list of bad clusters
                bad_clusters = ['too', 'long']
            row = {'logprob' : self.logprobs[ipart],
                   'n_clusters' : len(part),
                   'n_procs' : self.n_procs[ipart],
                   'clusters' : cluster_str}
            if smc_particles > 1:
                row['path_index'] = path_index
                row['logweight'] = self.logweights[ipart]
            if not is_data:
                if calc_adj_mi is None or self.adj_mis[ipart] != -1:  # if we don't want to write any adj mis, or if we already calculated it
                    row['adj_mi'] = self.adj_mis[ipart]
                else:
                    if calc_adj_mi == 'best' and ipart == self.i_best:  # only calculate adj_mi for the best partition
                        row['adj_mi'] = utils.mutual_information(part, reco_info)
                    else:
                        row['adj_mi'] = self.adj_mis[ipart]
                row['n_true_clusters'] = len(true_partition)
                row['bad_clusters'] = ';'.join(bad_clusters)
            writer.writerow(row)
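
    # Hedged sketch (not from the original source): the writer passed to write_partitions() just needs
    # fieldnames covering the row keys built above, e.g. for simulation input with smc_particles == 1:
    #
    #     writer = csv.DictWriter(outfile, fieldnames=('logprob', 'n_clusters', 'n_procs', 'clusters',
    #                                                  'adj_mi', 'n_true_clusters', 'bad_clusters'))
    #     writer.writeheader()
    #     cpath.write_partitions(writer, is_data=False, reco_info=reco_info, true_partition=true_partition,
    #                            smc_particles=1, path_index=0, n_to_write=10, calc_adj_mi='best')
    #
    # (cpath, outfile, reco_info and true_partition are hypothetical names, not from the source.)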
Example #4
import numpy as np
import pandas as pd


def preprocess(collab, work, edu, advs, prods, stop_words=[]):
    # expects pandas DataFrames indexed so the joins below line up; also relies on the
    # project's drop_if_missing(), cluster_text() and utils.mutual_information() helpers
    # drop rows with no collaborations
    data = collab[collab['Colaboracoes'] != 0]

    # drop work rows with missing
    # vals and join to running data
    work = drop_if_missing(work)
    data = data.join(work, how='inner')

    # coerce numerical types in edu and
    # drop rows with missing values,
    # except post-doc and specialization,
    # which can be NaN
    for col in edu.columns:
        if col in ('inicio', 'inicio.1', 'inicio.2', 'fim', 'fim.1', 'fim.2'):
            edu[col] = pd.to_numeric(edu[col], errors='coerce')
    for column in edu.columns:
        if column not in ('pos-doutorado', 'especializacao'):
            edu = edu[~pd.isna(edu[column])]

    # join to running data
    data = data.join(edu, how='inner')

    # join advisees data to running data
    data = data.join(advs, how='inner')

    # remove rows with no scientific
    # production and join to running data
    prods = prods[(prods != 0).any(axis=1)]
    data = data.join(prods, how='inner')

    # since there is high variability in how users
    # specify places and courses in their CVs, we
    # cluster them with LSA + K-Means

    # cluster places
    places = [col for col in data.columns if 'local' in col] + ['Instituicao Atual']
    data = cluster_text(data,
                        columns=places,
                        n_clusters=3000,
                        stop_words=stop_words)

    # cluster higher education
    courses = [
        'doutorado', 'graduacao', 'especializacao', 'mestrado', 'pos-doutorado'
    ]
    data = cluster_text(data,
                        columns=courses,
                        n_clusters=500,
                        stop_words=stop_words)

    # compute collaborations probabilities
    collab = data['Colaboracoes']
    total = len(collab)
    collab_prob = [np.sum(collab == x) / total for x in np.unique(collab)]
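    # collab_prob is the empirical marginal distribution P(Colaboracoes == x) over the distinct values x;
    # it is reused below as the X_marginal argument of utils.mutual_information()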

    # compute mutual information between features
    # and discard those that are independent from
    # collaborations
    all_cols = []
    mis = []
    for column in sorted(data.columns):
        if column != 'Colaboracoes':
            # compute mutual information
            mi = utils.mutual_information(collab,
                                          data[column],
                                          X_marginal=collab_prob)

            all_cols.append(column)
            mis.append(mi)

            # discard independent features
            if np.isclose(mi, 0):
                data = data.drop(columns=column)

    return data, mis, all_cols
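
Below is a rough, hypothetical sketch of the LSA + K-Means idea described in the comments above (TF-IDF, then truncated SVD, then K-Means, with each free-text column replaced by its cluster label), assuming scikit-learn is available; the project's actual cluster_text() implementation, parameters and signature may differ.

from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


def cluster_text_sketch(data, columns, n_clusters, stop_words=None):
    # hypothetical stand-in for the project's cluster_text(): LSA + K-Means over each text column
    for col in columns:
        texts = data[col].fillna('').astype(str)
        tfidf = TfidfVectorizer(stop_words=stop_words or None).fit_transform(texts)
        reduced = TruncatedSVD(n_components=100).fit_transform(tfidf)  # n_components must stay below the vocabulary size
        data[col] = KMeans(n_clusters=n_clusters).fit_predict(reduced)  # needs at least n_clusters rows
    return data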