Example #1
def run():
	# If prepared file does not exist yet, create it
	if not os.path.isfile(FREESURFER_FILE_PREP):
		# Load FreeSurfer features
		features = util.load_features(FREESURFER_FILE)
		# Perform age-matching
		features = prepare.match_ages(features, 'HC', 'SZ', age_diff=2)
		# Remove constant features
		features = prepare.remove_constant_features(features)
		# Normalize numerical features across subjects (excluding 'age')
		features = prepare.normalize_across_subjects(features, exclude=['age'])
		# Residualize features for age, gender and total intracranial volume
		features = prepare.residualize(features, ['age', 'gender', 'EstimatedTotalIntraCranialVol'])
		# Remove highly correlated features
		features = prepare.remove_correlated_features(features)
		# Remove certain columns
		features = prepare.remove_features(features, ['diagnosis', 'age', 'gender'])
		# Write prepared freesurfer features back to file
		util.save_features(FREESURFER_FILE_PREP, features)
	else:
		# Load prepared features
		features = util.load_features(FREESURFER_FILE_PREP)

	# Run DBSCAN on features
	dbscan = DBSCAN(epsilon=1.0, min_pts=5)
	dbscan.run(features)
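
For reference, the DBSCAN class used above appears to be a project-specific wrapper (it takes epsilon and min_pts). A minimal sketch of the same clustering step with scikit-learn, assuming features is a numeric matrix of shape (n_subjects, n_features):

from sklearn.cluster import DBSCAN as SkDBSCAN

# fit_predict returns one cluster label per subject; -1 marks noise points
labels = SkDBSCAN(eps=1.0, min_samples=5).fit_predict(features)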
Example #2
def get_all_combinations_of_variables(category):
    # List for all the variables involved
#     random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Dictionary to store the variable values
    dict_variables = {}
    dict_values = {}
    
    features = load_features(category)
    
#     #Writing to disk
#     if category == "cursive":    
#         pickle.dump(features, open("../total_features_cursive.p", "wb"))
#     elif category == "printed":
#         pickle.dump(features, open("../total_features_printed.p", "wb"))    
#     
#     print len(features)
    # Generating all combinations of length 2 
    for random_var in range(0, len(random_variables) + 1):
        for variable_pair in itertools.combinations(random_variables, random_var):
            if len(variable_pair) == 2:
                dict_values = get_dict_values(features, variable_pair, category)
                dict_variables[variable_pair] = dict_values
                # print(variable_pair)
    
    return dict_variables
Example #3
def get_all_combinations_of_variables(category):
    # List for all the variables involved
    #     random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    random_variables = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Dictionary to store the variable values
    dict_variables = {}
    dict_values = {}

    features = load_features(category)

    #     #Writing to disk
    #     if category == "cursive":
    #         pickle.dump(features, open("../total_features_cursive.p", "wb"))
    #     elif category == "printed":
    #         pickle.dump(features, open("../total_features_printed.p", "wb"))
    #
    #     print len(features)
    # Generating all combinations of length 2
    for random_var in range(0, len(random_variables) + 1):
        for variable_pair in itertools.combinations(random_variables,
                                                    random_var):
            if len(variable_pair) == 2:
                dict_values = get_dict_values(features, variable_pair,
                                              category)
                dict_variables[variable_pair] = dict_values
                # print(variable_pair)

    return dict_variables
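
Enumerating combinations of every length and keeping only the pairs, as above, does redundant work; a minimal equivalent sketch that requests length-2 combinations directly (assuming the same load_features and get_dict_values helpers) is:

import itertools

def get_all_pairs_of_variables(category):
    # Hypothetical variant of the function above: same inputs and result,
    # but only length-2 combinations are generated in the first place.
    random_variables = list(range(1, 13))
    features = load_features(category)
    return {pair: get_dict_values(features, pair, category)
            for pair in itertools.combinations(random_variables, 2)}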
Example #4
def translate_vcf(vcf_fname, reference, path, feature_names=None):
    import time
    start = time.time()
    try:
        vcf_dict = read_in_vcf(vcf_fname, ref_fasta(path))
    except:
        print "Loading input alignment failed!: {}".format(vcf_fname)
    end = time.time()
    print "Reading in VCF took {}".format(str(end-start))

    start = time.time()
    featN = np.array(feature_names)
    selected_features = load_features(reference, feature_names)
    print "Translating {} genes...".format(len(selected_features))
    end = time.time()
    print "Reading in genes took {}".format(str(end-start))

    ref = vcf_dict['reference']
    sequences = vcf_dict['sequences']

    prots = {}
    deleted = []
    notMult3 = []

    start = time.time()
    #if genes have no mutations across sequences, they are dropped here from further analysis
    #check that gene lengths are multiples of 3. The first occurrence causes an error in
    #Biopython, but subsequent ones do not - so check this ourselves.
    for fname,feature in selected_features.items():
        if len(str(feature.extract( SeqRecord(seq=Seq(ref)) ).seq))%3 != 0:
            notMult3.append(fname)

        prot_dict = translate_vcf_feature(sequences, ref, feature)
        if prot_dict is not None:
            prots[fname] = prot_dict
        else:
            deleted.append(fname)

    end = time.time()
    print "Translations took {}".format(str(end-start))

    start = time.time()
    #print out VCF of proteins
    #in new augur, set compress depending on input file ending!
    write_VCF_translation(prots, translation_vcf_file(path), translation_ref_file(path), compress=False)
    end = time.time()
    print "Writing out VCF took {}".format(str(end-start))

    #warn of those that don't have a length mult of 3
    print "WARNING: These genes do not have lengths that are a multiple of 3!\n{}".format(str(notMult3))

    #print dropped genes to a text file
    if len(deleted) != 0:
        with open(dropped_genes(path), 'w') as the_file:
            for d in deleted:
                the_file.write(d+"\n")
        print "{} genes had no mutations and so will be excluded. Excluded genes can be found in {}".format(len(deleted), dropped_genes(path))

    return prots
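
A hypothetical invocation of the function above (file names, path and gene list are placeholders, not taken from the original project):

prots = translate_vcf('data/variants.vcf.gz', 'reference.gb', 'results/',
                      feature_names=['gene1', 'gene2'])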
Example #5
def run_model(train_data,
              test_data,
              n_trees,
              submit_id,
              model,
              save_model=False):
    '''
  Trains a model of the specified type and size on the training data,
  then predicts on the test data and writes out a submission.
  
  Args:
    train_data - bare training feature set name without path or extension 
    test_data - bare test feature set name without path or extension
    n_trees - number of trees to use in model
    submit_id - the result is written as submissions/submission_<submit_id>.csv
    model - a string...either 'rf' or 'extra'
    save_model - default False. If true, use joblib to dump the model at:
      paths.MODELS/<submit_id>_model.job

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv
  '''
    start = datetime.now()
    train = util.load_features(train_data)
    drops = util.get_drop_cols(train)
    train.drop(drops, axis=1, inplace=True)
    print 'training set size: (%d, %d)' % train.shape
    print 'Training...'
    if model == 'rf':
        model = train_rf(train, n_trees)
    else:
        model = train_extra_trees(train, n_trees)
    if save_model:
        model_path = os.path.join(paths.MODELS, submit_id + '_model.job')
        joblib.dump(model, model_path)
    del train
    print 'Predicting...'
    test = util.load_features(test_data)
    test.drop(drops, axis=1, inplace=True)
    print 'test set size: (%d, %d)' % test.shape
    result = predict(model, test)
    submission_name = 'submission_%s.csv' % str(submit_id)
    submission = os.path.join(paths.SUBMIT, submission_name)
    result.to_csv(submission, index=False)
    finish = datetime.now()
    print 'Run finished: %d sec.' % (finish - start).seconds
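
A hypothetical call of the function above (feature-set names, tree count and submit id are placeholders):

run_model('train_features_v1', 'test_features_v1', n_trees=500,
          submit_id='rf_500', model='rf', save_model=True)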
Example #6
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))], util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    for X in [X1, X2]:
        NUM_RUNS = 50
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(len(X))#range(1, len(X), 12)#random.sample(range(len(X)), 25)
        samps = samples#range(len(X))#samples 
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]
        N_ESTIMATORS = 20
        avg_mat = None 

        for run in range(NUM_RUNS): 
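            # Random-forest proximity: two samples are treated as similar whenever a
            # tree routes them to the same leaf; the per-run counts become a squared
            # distance matrix below and are averaged across runs before clustering.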
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_features=20, oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i,j)] = similarity.get((i,j), 0) + 1

            mat = np.array([[(1.0 - similarity.get((i,j), 0)/N_ESTIMATORS)**2 for j in samples] for i in samples])
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)  
        avg_mat = avg_mat / NUM_RUNS
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix, color_threshold=0.8, labels=y, show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
Example #7
def run():

    for center in CONFIG.keys():

        # Load meta data and give it a new index based on measurement and subject ID
        meta_data_file_path = CONFIG[center]['meta']
        meta_data = pd.read_excel(meta_data_file_path, header=3)
        index = []
        for i in range(len(meta_data.index)):
            mid = meta_data.iloc[i][meta_data.columns[0]]
            sid = meta_data.iloc[i][meta_data.columns[1]]
            index.append('{}_{}_sMRI'.format(sid, mid))
        meta_data['id'] = pd.Series(index)
        meta_data.set_index('id', drop=True, inplace=True)

        # Load feature data
        features_file_path = CONFIG[center]['features']
        features = util.load_features(features_file_path, index_col='MRid')

        try:
            # Select rows in meta data corresponding to subject IDs in feature data.
            # Currently, there seems to be something wrong with the CIMH data, that
            # is, there's no overlap in subject IDs at all...
            # TODO: Wait for Emanuel to explain
            meta_data = meta_data.loc[features.index]
        except KeyError as e:
            print(
                'Subject IDs in feature data do not match meta data: {}'.format(e))
            continue

        meta_data = meta_data[meta_data['Gender [m/f]'].notnull()]

        # Convert gender values to standardized format
        for idx in meta_data.index:
            gender = meta_data.loc[idx]['Gender [m/f]']
            meta_data.set_value(idx, 'Gender [m/f]', to_gender(gender))

        # Add columns to original feature data
        features['Center'] = center
        features['Age'] = meta_data['Age [years]']
        features['Gender'] = meta_data['Gender [m/f]']
        features['Diagnosis'] = meta_data['Diagnosis']
        CONFIG[center]['features_ext'] = features

    # Concatenate feature data sets
    features = pd.concat([
        CONFIG['CIMH']['features_ext'],
        CONFIG['UIO']['features_ext'],
        CONFIG['UNIBA']['features_ext'],
        CONFIG['UNICH']['features_ext'],
    ])

    # Save concatenated feature data back to CSV file
    util.save_features(OUTPUT_FILE, features, index_label='MRid')
Example #8
def translate(aln_fname, reference, feature_names, name_func):
    try:
        aln = AlignIO.read(aln_fname, 'fasta')
    except:
        print("Loading input alignment failed!:", aln_fname)

    selected_features = load_features(reference, feature_names)

    for fname, feature in selected_features.items():
        translation = translate_feature(aln, feature)
        AlignIO.write(translation, name_func(fname), 'fasta')
Example #9
def run_model(train_data, test_data, n_trees, submit_id, model, save_model=False):
  '''
  Trains a model of the specified type and size on the training data,
  then predicts on the test data and writes out a submission.
  
  Args:
    train_data - bare training feature set name without path or extension 
    test_data - bare test feature set name without path or extension
    n_trees - number of trees to use in model
    submit_id - the result is written as submissions/submission_<submit_id>.csv
    model - a string...either 'rf' or 'extra'
    save_model - default False. If true, use joblib to dump the model at:
      paths.MODELS/<submit_id>_model.job

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv
  '''
  start = datetime.now()
  train = util.load_features(train_data)
  drops = util.get_drop_cols(train)
  train.drop(drops, axis=1, inplace=True)
  print 'training set size: (%d, %d)' % train.shape
  print 'Training...'
  if model == 'rf':
    model = train_rf(train, n_trees)
  else:
    model = train_extra_trees(train, n_trees)
  if save_model:
    model_path = os.path.join(paths.MODELS, submit_id + '_model.job')
    joblib.dump(model, model_path)
  del train
  print 'Predicting...'
  test = util.load_features(test_data)
  test.drop(drops, axis=1, inplace=True)
  print 'test set size: (%d, %d)' % test.shape
  result = predict(model, test)
  submission_name = 'submission_%s.csv' % str(submit_id)
  submission = os.path.join(paths.SUBMIT, submission_name)
  result.to_csv(submission, index=False)
  finish = datetime.now()
  print 'Run finished: %d sec.' % (finish - start).seconds
Example #10
def run():

    for center in CONFIG.keys():

        # Load meta data and give it a new index based on measurement and subject ID
        meta_data_file_path = CONFIG[center]['meta']
        meta_data = pd.read_excel(meta_data_file_path, header=3)
        index = []
        for i in range(len(meta_data.index)):
            mid = meta_data.iloc[i][meta_data.columns[0]]
            sid = meta_data.iloc[i][meta_data.columns[1]]
            index.append('{}_{}_sMRI'.format(sid, mid))
        meta_data['id'] = pd.Series(index)
        meta_data.set_index('id', drop=True, inplace=True)

        # Load feature data
        features_file_path = CONFIG[center]['features']
        features = util.load_features(features_file_path, index_col='MRid')

        try:
            # Select rows in meta data corresponding to subject IDs in feature data.
            # Currently, there seems to be something wrong with the CIMH data, that
            # is, there's no overlap in subject IDs at all...
            # TODO: Wait for Emanuel to explain
            meta_data = meta_data.loc[features.index]
        except KeyError as e:
            print('Subject IDs in feature data do not match meta data: {}'.format(e))
            continue

        meta_data = meta_data[meta_data['Gender [m/f]'].notnull()]

        # Convert gender values to standardized format
        for idx in meta_data.index:
            gender = meta_data.loc[idx]['Gender [m/f]']
            meta_data.set_value(idx, 'Gender [m/f]', to_gender(gender))

        # Add columns to original feature data
        features['Center'] = center
        features['Age'] = meta_data['Age [years]']
        features['Gender'] = meta_data['Gender [m/f]']
        features['Diagnosis'] = meta_data['Diagnosis']
        CONFIG[center]['features_ext'] = features

    # Concatenate feature data sets
    features = pd.concat([
        CONFIG['CIMH']['features_ext'],
        CONFIG['UIO']['features_ext'],
        CONFIG['UNIBA']['features_ext'],
        CONFIG['UNICH']['features_ext'],
    ])

    # Save concatenated feature data back to CSV file
    util.save_features(OUTPUT_FILE, features, index_label='MRid')
Example #11
def export_diversity(path, prefix, reference, indent=None):
    '''
    write the alignment entropy of each alignment (nucleotide and translations) to file
    '''
    genes = load_features(reference)
    entropy_json = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        entropy = diversity_statistics(aln_fname, nuc=feat == 'nuc')
        S = [max(0, round(x, 4)) for x in entropy]
        n = len(S)
        if feat == 'nuc':
            entropy_json[feat] = {
                'pos': range(0, n),
                'codon': [x // 3 for x in range(0, n)],
                'val': S
            }
        elif feat in genes:
            entropy_json[feat] = {
                'pos': [x for x in genes[feat]][::3],
                'codon': range(n),
                'val': S
            }
    write_json(entropy_json, diversity_json(path, prefix), indent=indent)
Example #12
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)
    
    mfcc = dict(zip([metadata[i][0] for i in range(1, len(metadata))], util.load_features((dataset + "_features") if dataset else None)))
    feats, files = None,None
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], [] 
    if full_dataset:
        X3 = [np.concatenate((F[item[0]], mfcc[item[0]]), axis=0) for item in metadata[1:]]
        X2 = [F[item[0]] for item in metadata[1:]] 
    X1 = [mfcc[item[0]] for item in metadata[1:]] 
    Y = util.load_labels((dataset + "_metadata") if dataset else None)#"bbsmd.csv")

    for X in [X1, X2] if full_dataset else [X1,]:
        print("------")
       
        classifiers = [ RandomForestClassifier(n_estimators=50, max_features=15, oob_score=True),
            KNeighborsClassifier(3),
            svm.SVC(kernel='linear', C=1),
            svm.SVC(gamma=2, C=1),
            GaussianNB()
        ]
        for clf in classifiers:
            scores = cross_val_score(clf, X, Y, cv=5)
            score = sum(scores)/len(scores)
            print(type(clf).__name__, "\t", score)
Example #13
def translate(aln_fname, reference, feature_names, name_func):
    try:
        aln = AlignIO.read(aln_fname, 'fasta')
    except:
        print("Loading input alignment failed!:", aln_fname)
        print("Loading input alignment failed!:", aln_fname)

    selected_features = load_features(reference, feature_names)

    for fname, feature in selected_features.items():
        translation = translate_feature(aln, feature)
        AlignIO.write(translation, name_func(fname), 'fasta')


if __name__ == '__main__':
    parser = generic_argparse("Translate the nucleotide alignments")
    parser.add_argument('--reference',
                        required=True,
                        help='genbank file containing the annotation')
    parser.add_argument('--genes', nargs='+', help="genes to translate")
    args = parser.parse_args()

    path = args.path

    if not args.genes:
        genes = load_features(args.reference).keys()
    else:
        genes = args.genes

    for func in [tree_sequence_alignment, ref_alignment]:
        aln_fname = func(path, 'nuc')
        if os.path.isfile(aln_fname):
            translate(aln_fname, args.reference, genes,
                      lambda x: func(path, x))
Example #14
def export_metadata_json(T, path, prefix, reference, isvcf=False, indent=1):
    print("Writing out metaprocess")
    mjson = {}

    mjson["virus_count"] = T.count_terminals()
    from datetime import date
    mjson["updated"] = date.today().strftime('%Y-%m-%d')
    mjson["author_info"] = {
        "?": {
            "paper_url": "?",
            "journal": "?",
            "title": "?",
            "n": 1
        }
    }
    mjson["seq_author_map"] = {}

    from collections import defaultdict
    cmaps = defaultdict(list)
    with open(color_maps(path), 'r') as cfile:
        for line in cfile:
            try:
                trait, name, color = line.strip().split('\t')
            except:
                continue
            cmaps[trait].append((name, color))

    #if drug-resistance colours have been auto-generated, get these too
    import os.path
    if os.path.isfile(drm_color_maps(path)):
        with open(drm_color_maps(path), 'r') as cfile:
            for line in cfile:
                try:
                    trait, name, color = line.strip().split('\t')
                except:
                    continue
                cmaps[trait].append((name, color))

    mjson["color_options"] = {
        "gt": {
            "menuItem": "genotype",
            "type": "discrete",
            "legendTitle": "Genotype",
            "key": "genotype"
        },
        "num_date": {
            "menuItem": "date",
            "type": "continuous",
            "legendTitle": "Sampling date",
            "key": "num_date"
        }
    }
    for trait in cmaps:
        mjson["color_options"][trait] = {
            "menuItem": trait,
            "type": "discrete",
            "color_map": cmaps[trait],
            "legendTitle": trait,
            "key": trait
        }

    mjson["panels"] = ["tree", "map", "entropy"]
    mjson["title"] = "NextTB"
    mjson["maintainer"] = "Emma Hodcroft"
    mjson["geo"] = {}
    lat_long_defs = load_lat_long_defs()
    for geo_trait in ['region', "country", 'division']:
        mjson["geo"][geo_trait] = {}
        for n in T.find_clades():
            if geo_trait in n.attr:
                place = n.attr[geo_trait]
                if (place not in mjson["geo"][geo_trait]
                        and place in lat_long_defs):
                    mjson["geo"][geo_trait][place] = lat_long_defs[place]

    mjson["commit"] = "unknown"
    mjson["filters"] = ["country", "region", "division"]

    genes = load_features(reference)
    anno = {}
    for feat, aln_fname in get_genes_and_alignments(path, tree=False):
        if feat in genes:
            anno[feat] = {
                "start": int(genes[feat].location.start),
                "end": int(genes[feat].location.end),
                "strand": genes[feat].location.strand
            }

    if isvcf:
        #if vcf, there is no 'gene' called 'nuc' that will be read in
        #above, so manually include it here.
        from filenames import ref_fasta
        from Bio import SeqIO
        refSeq = next(SeqIO.parse(ref_fasta(path), format='fasta'))
        anno['nuc'] = {"start": 1, "end": len(refSeq.seq), "strand": 1}

    mjson["annotations"] = anno
    write_json(mjson, meta_json(path, prefix), indent=indent)
Example #15
def generate_matrix_with_label(raw_item_file_path, uipairs_features_file_path,
                               users_features_file_path,
                               items_features_file_path,
                               categorys_features_file_path,
                               ucpairs_features_file_path, label_file_path,
                               begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + "generating matrix with label..."

    users_column_index, users_features = load_features(
        users_features_file_path)
    items_column_index, items_features = load_features(
        items_features_file_path)
    categorys_column_index, categorys_features = load_features(
        categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix-label.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    label_set = set()
    label_file = open(label_file_path)
    for line in label_file:
        line_entrys = line.split(delimiter)
        ui_id = delimiter.join(line_entrys[0:2])
        if line_entrys[2] == '4':
            label_set.add(ui_id)
    label_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    # matrix_file.write(delimiter.join(ui_column_name) + ",label\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        label = "0"
        if (user_id + "," + item_id) in label_set:
            label = "1"
        matrix_file.write(delimiter.join(matrix_line) + "," + label + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix with label completed\n"

    return matrix_file_path


# path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))+'\\source'
# os.chdir(path)  # change dir to '~/files'
#
# uipairs_features_file_path = "./feature/2014-12-9-2014-12-18-uifeat.csv"
# users_features_file_path = "./feature/2014-12-9-2014-12-18-userfeat.csv"
# items_features_file_path = "./feature/2014-12-9-2014-12-18-itemfeat.csv"
#
# generate_matrix(uipairs_features_file_path, users_features_file_path, items_features_file_path, "2014-12-9", "2014-12-18")
Example #16
def generate_matrix(raw_item_file_path, uipairs_features_file_path,
                    users_features_file_path, items_features_file_path,
                    categorys_features_file_path, ucpairs_features_file_path,
                    begin_date, end_date):
    print "\n" + begin_date + "---" + begin_date + "generating matrix..."

    users_column_index, users_features = load_features(
        users_features_file_path)
    items_column_index, items_features = load_features(
        items_features_file_path)
    categorys_column_index, categorys_features = load_features(
        categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    matrix_file.write(delimiter.join(ui_column_name) + "\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        matrix_file.write(delimiter.join(matrix_line) + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix completed\n"

    return matrix_file_path
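
The load_features helper used by these matrix generators is not shown here; judging from the call sites it returns the header columns plus a dict keyed by the id in the first column. A minimal sketch under that assumption (not the original implementation), reusing the module-level delimiter:

def load_features(features_file_path):
    # Assumed behaviour: first line is the header, the first column of each
    # data line is the id, and the remaining columns are kept as strings.
    with open(features_file_path) as features_file:
        column_index = features_file.readline().strip().split(delimiter)
        features = {}
        for line in features_file:
            entrys = line.strip().split(delimiter)
            features[entrys[0]] = entrys[1:]
    return column_index, features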
Example #17
def load():

    return util.load_features(FILE_NAME, index_col='MRid')
Example #18
def generate_matrix_with_label(raw_item_file_path, uipairs_features_file_path, users_features_file_path, items_features_file_path, categorys_features_file_path, ucpairs_features_file_path, label_file_path, begin_date, end_date):
    print "\n" + begin_date + "---" + end_date + "generating matrix with label..."

    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix-label.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    label_set = set()
    label_file = open(label_file_path)
    for line in label_file:
        line_entrys = line.split(delimiter)
        ui_id = delimiter.join(line_entrys[0:2])
        if line_entrys[2] == '4':
            label_set.add(ui_id)
    label_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    # matrix_file.write(delimiter.join(ui_column_name) + ",label\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        label = "0"
        if (user_id+","+item_id) in label_set:
            label = "1"
        matrix_file.write(delimiter.join(matrix_line) + "," + label + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix with label completed\n"

    return matrix_file_path

# path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))+'\\source'
# os.chdir(path)  # change dir to '~/files'
#
# uipairs_features_file_path = "./feature/2014-12-9-2014-12-18-uifeat.csv"
# users_features_file_path = "./feature/2014-12-9-2014-12-18-userfeat.csv"
# items_features_file_path = "./feature/2014-12-9-2014-12-18-itemfeat.csv"
#
# generate_matrix(uipairs_features_file_path, users_features_file_path, items_features_file_path, "2014-12-9", "2014-12-18")
Example #19
def main():
    dataset = None
    if len(sys.argv) > 1:
        dataset = sys.argv[1]
    metadata = util.get_metadata((dataset + "_metadata") if dataset else None)

    mfcc = dict(
        zip([metadata[i][0] for i in range(1, len(metadata))],
            util.load_features((dataset + "_features") if dataset else None)))

    # Load pyAudioAnalysis features
    with open("F", "rb") as f:
        feats, files = pickle.load(f, encoding="latin1")
    files = [f.split(".")[0].split("XC")[-1] for f in files]
    F = dict(zip(files, feats))
    full_dataset = True
    for item in metadata[1:]:
        if item[0] not in F:
            full_dataset = False
    X2, X3 = [], []
    if full_dataset:
        X3 = [
            np.concatenate((F[item[0]], mfcc[item[0]]), axis=0)
            for item in metadata[1:]
        ]
        X2 = [F[item[0]] for item in metadata[1:]]
    X1 = [mfcc[item[0]] for item in metadata[1:]]

    #X = util.load_features((dataset + "_features") if dataset else None)
    for X in [X1, X2]:
        labels = []
        avg_mat = None
        all_sims = dict()
        Y = util.load_labels((dataset + "_metadata") if dataset else None)
        samples = range(
            len(X))  #range(1, len(X), 12)#random.sample(range(len(X)), 25)
        samps = range(len(X))  #samples
        x = [X[i] for i in samps]
        y = [Y[i] for i in samples]

        N_ESTIMATORS = 80
        NUM_RUNS = 5

        for run in range(NUM_RUNS):
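            # Same leaf-co-occurrence proximity as in the earlier example, but pooled
            # per species: pairwise sample similarities are aggregated into species-level
            # similarities, normalised by a species' self-similarity, and averaged over runs.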
            clf = RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                         max_features=25,
                                         oob_score=True).fit(X, Y)
            similarity = dict()
            for dt in clf.estimators_:
                leaves = dt.apply(X)
                for i in samps:
                    for j in samps:
                        if leaves[i] == leaves[j]:
                            similarity[(i, j)] = similarity.get(
                                (i, j), 0) + (1 / N_ESTIMATORS)

            species_similarity = dict()
            for i in samps:
                for j in samps:
                    species_similarity[(Y[i], Y[j])] = species_similarity.get(
                        (Y[i], Y[j]), 0) + similarity.get(
                            (i, j), 0)**2 / (Y.count(Y[i]) * Y.count(Y[j]))

            for k in species_similarity:
                species_similarity[k] = species_similarity[k]**(0.5)

            labels = clf.classes_
            for i in range(len(labels)):
                normal = species_similarity[(labels[i], labels[i])]
                for j in range(i, len(labels)):
                    k = labels[i], labels[j]
                    species_similarity[k] /= normal
                    species_similarity[(k[1], k[0])] = species_similarity[k]
                    all_sims[k] = all_sims.get(
                        k, 0) + species_similarity[k] / NUM_RUNS

            mat = np.array([[(1.0 - species_similarity.get((i, j), 0))**2
                             for j in labels] for i in labels])
            print(mat)
            mat = squareform(mat)
            if avg_mat is None:
                avg_mat = mat
            else:
                avg_mat = np.add(avg_mat, mat)
        avg_mat = avg_mat / NUM_RUNS
        print(avg_mat)
        for k in all_sims:
            if k[0] != k[1] and all_sims[k] > 0.1:
                print("{}\t{}\t{}".format(k[0], k[1], all_sims[k]))
        linkage_matrix = linkage(avg_mat, "single")
        matplotlib.rcParams['lines.linewidth'] = 2.5
        dendrogram(linkage_matrix,
                   color_threshold=0.65,
                   labels=labels,
                   show_leaf_counts=True)
        plt.xlabel("label")
        plt.ylabel("distance")
        plt.show()
Example #20
#     #train_labels,_ = util.read_h5(os.path.join(prefix,output_dir,"train_selected_cnn_{}_label.h5".format(feature_name)))
#     train_features,train_labels = util.load_features(prefix,output_dir,feature_name,fold,"train")
#     test_features,test_labels = util.load_features(prefix,test_output_dir,feature_name,fold,"test")
#     #compute_features(train_features,train_labels,"fisher_{}.npy".format(fold),method="fisher")
#     load_feature_sets("fisher",feature_num=1000,padded=True,save=False)

# num of feature vs acc/f1
#for i in [10,20,50,100,1000]:
for i in [1000]:
    accs, mf1s, wf1s = [], [], []
    for j in [1, 2, 3]:
        fold = j
        feature_name = "embo{}_norm".format(fold)
        #model_name="train_combined{}_multiple_norm".format(fold)

        train_features, train_labels = util.load_features(
            prefix, output_dir, feature_name, fold, "train")
        test_features, test_labels = util.load_features(
            prefix, output_dir, feature_name, fold, "test")
        acc, mf1, wf1, conf = load_feature_sets("fisher",
                                                i,
                                                save=False,
                                                padded=False)
        if j == 1: confs = conf
        else: confs += conf
        accs.append(acc)
        mf1s.append(mf1)
        wf1s.append(wf1)
    print("mean acc", np.mean(accs), "std acc", np.std(accs))
    print("mean weighted F1 scores", np.mean(wf1s), "std weighted F1 scores",
          np.std(wf1s))
    print("mean macro F1 scores", np.mean(mf1s), "std macro F1 scores",
Example #21
def PCA_analysis(feature_num=5,
                 fold=1,
                 method="mrmr",
                 cluster_method="one-vs-one"):
    all_idx = []
    feature_name = "combined{}_multiple_norm".format(fold)
    if cluster_method == "one-vs-one":
        for count, combo in enumerate(combinations(labels_dict.items(), 2)):
            score, idx = load_features("{}_{}_{}_{}.npy".format(
                method, combo[0][0], combo[1][0], fold))
            all_idx.extend(idx[:feature_num])
            #all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}_{}.txt".format(method,combo[0][0],combo[1][0])),idx=True)[:feature_num])
    elif cluster_method == "one-vs-all":
        for key in labels_dict.keys():
            score, idx = load_features("{}_{}_{}.npy".format(
                method, key, fold))
            all_idx.extend(idx[:feature_num])
            #all_idx.extend(get_most_important_features(os.path.join(method,"{}_feature_importance_{}.txt".format(method,key)),idx=True)[:feature_num])
    elif cluster_method == "overlap":
        for key in labels_dict.keys():
            feature_dict = find_over_lap_features(method, key, True)
            for k, v in feature_dict.items():
                all_idx.extend(v[:feature_num])

    all_idx = list(set(all_idx))

    score, all_idx_overlap = load_features("overlap_at_least_3.npy")
    intersection_idx = list(set(all_idx_overlap).intersection(set(all_idx)))
    for i in intersection_idx:
        print(feature_names[i])

    train_features, train_labels = util.load_features(prefix, output_dir,
                                                      feature_name, fold,
                                                      "train")
    train_features_selected = train_features[:, all_idx]
    test_features, test_labels = util.load_features(prefix, output_dir,
                                                    feature_name, fold, "test")
    test_features_selected = test_features[:, all_idx]
    print("original", test_features_selected.shape)
    lda = LDA(n_components=None,
              priors=None,
              shrinkage=None,
              solver='svd',
              store_covariance=False,
              tol=0.0001)
    test_features_selected = lda.fit_transform(test_features_selected,
                                               test_labels)
    print("lda", test_features_selected.shape)
    #test_features_selected=PCA(n_components=2).fit_transform(test_features_selected)
    tf = TSNE(n_components=2,
              perplexity=30).fit_transform(test_features_selected)
    colors = ['r', 'darkgreen', 'y', 'c', 'b']
    curr_colors = np.asarray([colors[int(i)] for i in test_labels])

    selected_idx = get_num_samples(test_labels)
    selected_colors = curr_colors[selected_idx]
    plt.scatter(tf[selected_idx, 0],
                tf[selected_idx, 1],
                c=selected_colors,
                alpha=0.7)

    #plt.scatter(tf[:,0],tf[:,1],c=curr_colors,alpha=0.7)

    legend_elements = [
        Line2D([0], [0],
               marker='o',
               color='w',
               label='CRY',
               markerfacecolor='r',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='FUS',
               markerfacecolor='darkgreen',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='LAU',
               markerfacecolor='y',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='BAB',
               markerfacecolor='c',
               markersize=10),
        Line2D([0], [0],
               marker='o',
               color='w',
               label='SCR',
               markerfacecolor='b',
               markersize=10)
    ]
    plt.legend(handles=legend_elements, loc="upper right")
    #plt.show()
    #plt.savefig("overlap_at_least_3_reduced_bab_600_other_120.png")
    plt.savefig("{}/{}_top_{}_reduced_bab_600_other_120.png".format(
        method, cluster_method, feature_num))
    plt.close()
Example #22
def generate_matrix(raw_item_file_path, uipairs_features_file_path, users_features_file_path, items_features_file_path, categorys_features_file_path, ucpairs_features_file_path, begin_date, end_date):
    print "\n" + begin_date + "---" + begin_date + "generating matrix..."

    users_column_index, users_features = load_features(users_features_file_path)
    items_column_index, items_features = load_features(items_features_file_path)
    categorys_column_index, categorys_features = load_features(categorys_features_file_path)
    uc_column_index, uc_features = load_uc_features(ucpairs_features_file_path)

    uipairs_features_file = open(uipairs_features_file_path)

    matrix_file_path = "./feature/" + begin_date + "-" + end_date + "-matrix.csv"
    matrix_file = open(matrix_file_path, 'w')

    # Load the item -> category dictionary
    item_category_dict = {}
    raw_item_file = open(raw_item_file_path)
    for line in raw_item_file:
        line_entrys = line.strip().split(delimiter)
        item_id = line_entrys[0]
        category_id = line_entrys[2]
        item_category_dict[item_id] = category_id
    raw_item_file.close()

    # Read the column names
    users_features_file = open(users_features_file_path)
    items_features_file = open(items_features_file_path)
    categorys_features_file = open(categorys_features_file_path)
    ucpairs_features_file = open(ucpairs_features_file_path)
    ui_column_name = uipairs_features_file.readline().split(delimiter)[:-1] + \
                     users_features_file.readline().split(delimiter)[1:-1] + \
                     items_features_file.readline().split(delimiter)[1:-1] + \
                     categorys_features_file.readline().split(delimiter)[1:-1] + \
                     ucpairs_features_file.readline().split(delimiter)[2:-1]

    matrix_file.write(delimiter.join(ui_column_name) + "\n")
    users_features_file.close()
    items_features_file.close()
    categorys_features_file.close()
    ucpairs_features_file.close()

    for line in uipairs_features_file:
        line_entrys = line.split(delimiter)
        user_id = line_entrys[0]
        item_id = line_entrys[1]

        # matrix_line = delimiter.join(line_entrys[:-1]) + delimiter + \
        #               delimiter.join(users_features[user_id]) + delimiter + \
        #               delimiter.join(items_features[item_id]) + \
        #               delimiter.join(categorys_features[item_id]) + \
        #               delimiter.join(uc_features[ui_id]) + "\n"
        matrix_line = line_entrys[:-1] + \
                      users_features[user_id] + \
                      items_features[item_id] + \
                      categorys_features[item_category_dict[item_id]] + \
                      uc_features[item_category_dict[item_id]]

        matrix_file.write(delimiter.join(matrix_line) + "\n")

    matrix_file.close()
    uipairs_features_file.close()
    print "generate matrix completed\n"

    return matrix_file_path