Example 1
    binary = args.binary
    n_jobs = args.jobs
    balanced = args.balanced
    bp = args.biological_process
    mf = args.molecular_function
    cc = args.cellular_component
    permute = args.permute
    scale = args.scale
    folder = args.output_folder
    file_suffix = args.output_file_suffix


    # ----------------------------- SETUP ----------------------------------- #
    # Timestamp used to make output directory names unique, e.g. 2016-01-01-12-30-59-000001.
    date = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    log = open("tmp/training_log.txt", "w")
    dag = load_go_dag('data/gene_ontology.1_2.obo')

    if vectorizer_method not in ['count', 'tf-idf']:
        print('Vectorizer method must be one of: count | tf-idf')
        sys.exit(1)

    if folder:
        direc = 'results/{}-{}'.format(folder, date)
        su_make_dir(direc)
    else:
        direc = tempfile.mkdtemp(prefix='{}-{}-'.format(method, date), dir='results/')

    selection = []
    ontologies = []
    if pfam:
        selection.append('pfam')
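The fragment above reads its configuration from an argparse namespace built earlier in the script. As a rough sketch only, a parser supplying the attributes accessed above might look like the following; every flag name here is inferred from those attribute accesses rather than taken from the original source.

import argparse

parser = argparse.ArgumentParser(description='Train PPI classification models.')
parser.add_argument('--binary', action='store_true')
parser.add_argument('--jobs', type=int, default=1)
parser.add_argument('--balanced', action='store_true')
parser.add_argument('--biological_process', action='store_true')
parser.add_argument('--molecular_function', action='store_true')
parser.add_argument('--cellular_component', action='store_true')
parser.add_argument('--permute', action='store_true')
parser.add_argument('--scale', action='store_true')
parser.add_argument('--output_folder', default='')
parser.add_argument('--output_file_suffix', default='')
args = parser.parse_args()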
Example 2
def build_data_frame(ppi_file, obo_file, accession_to_feature_file, induce, fill_na, cache, n_jobs):
    """
    Loads each tsv file containing a feature set (such as float similarity scores or accessions) into a pandas
    dataframe and then attempts to create binary vector/bag of words representations of textual accesion
    features. Finally combines each binary/numerical vector into a single feature vector along with it's label.

    @param ppi_file: Directory to look for feature files.
    @param obo_file: Path to obo file.
    @param accession_to_feature_file: Path to accession-feature map stored in tsv.
    @param induce: True to induce GO terms.
    @param fill_na: Value to fill NA with. Best to use np.NaN.
    @param cache: File to save dataframe to.

    @return: DataFrame.
    """
    print("Building dataframe...")

    dag = ontology.load_go_dag(obo_file)

    # Create the blank dataframe to which we will attach our data.
    labels = get_labels_from_file('data/labels.tsv')

    # A list of pairs keeps the column order explicit; passing a dict literal to
    # OrderedDict does not guarantee key order before Python 3.7.
    od = Od([
        ('uniprot', []),
        ('uniprot_a', []),
        ('uniprot_b', []),
        ('go', []),
        ('go_cc', []),
        ('go_bp', []),
        ('go_mf', []),
        ('induced_go', []),
        ('induced_go_cc', []),
        ('induced_go_bp', []),
        ('induced_go_mf', []),
        ('ipr', []),
        ('pfam', []),
        ('sim', []),
        ('label', [])
    ])
    columns = list(od.keys()) + labels

    # This gives quick access to the binary labels used by binary-relevance (BR) methods.
    # Initialise these labels to a null value.
    for l in labels:
        od[l] = -1

    # Iterate through each ppi in the supplied file.
    fp = open(ppi_file, 'r')
    fp.readline()  # skip the header line (the internal format is assumed to include one)

    def do_line(line):
        xs = line.strip().split('\t')
        p1 = xs[0].strip()
        p2 = xs[1].strip()
        reaction_type = xs[2].strip()
        reaction_types = [x.lower() for x in reaction_type.split(',')]

        # Semantic similarity scores for each namespace (a measure such as Resnik could be used here).
        cc_ss = float(xs[3].strip())
        bp_ss = float(xs[4].strip())
        mf_ss = float(xs[5].strip())

        terms = compute_features([p1, p2], induce, accession_to_feature_file, fill_na, dag)

        od = Od({
            'uniprot': [(p1, p2)],
            'uniprot_a': [p1],
            'uniprot_b': [p2],
            'go': [terms['go']],
            'go_cc': [terms['go_cc']],
            'go_bp': [terms['go_bp']],
            'go_mf': [terms['go_mf']],
            'induced_go': [terms['induced_go']],
            'induced_go_cc': [terms['induced_go_cc']],
            'induced_go_bp': [terms['induced_go_bp']],
            'induced_go_mf': [terms['induced_go_mf']],
            'ipr': [terms['ipr']],
            'pfam': [terms['pfam']],
            'sim': [csr_matrix([cc_ss, bp_ss, mf_ss])],
            'label': [reaction_type]
        })

        # Iterate and check which labels are present in reaction_type.
        # Order of traversal is important here.
        for l in labels:
            if l.lower() in reaction_types:
                od[l] = 1
            else:
                od[l] = 0

        # Build a single-row dataframe for this interaction.
        df_new = pd.DataFrame(
            od, dtype='object',
            columns=columns
        )
        return df_new

    try:
        df_rows = parallel_map(do_line, fp, n_jobs=n_jobs)
    except KeyboardInterrupt:
        sys.exit(0)

    fp.close()
    df = pd.concat(df_rows, ignore_index=True)  # ignore_index already yields a fresh 0..n-1 index
    with open(cache, 'wb') as cache_fp:         # pickle requires binary mode
        pickle.dump(df, cache_fp)
    return df
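As a usage illustration only, a call to build_data_frame might look like the sketch below; the ppi, feature-map and cache paths are hypothetical placeholders, while the obo path is the one used elsewhere in this code.

df = build_data_frame(
    ppi_file='data/ppis.tsv',                        # hypothetical tsv of interactions with a header line
    obo_file='data/gene_ontology.1_2.obo',
    accession_to_feature_file='data/features.tsv',   # hypothetical accession-to-feature map
    induce=True,                                     # also attach induced (ancestor) GO terms
    fill_na=np.NaN,
    cache='tmp/dataframe.pkl',                       # hypothetical cache path
    n_jobs=4
)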
Example 3
def compute_ss(ppi_tuples):
    # NOTE: tempfile.mktemp is deprecated and race-prone; tempfile.mkstemp is the safer alternative.
    r_file_in = tempfile.mktemp(suffix='.tsv', prefix='r_in_', dir='tmp')
    r_file_out = tempfile.mktemp(suffix='.tsv', prefix='r_out_', dir='tmp')
    dag = ontology.load_go_dag(OBO_FILE)
    feature_df = load_data_frame(ACCESSION_FEATURES_FILE, fill_na=np.NaN)

    # Write the GO terms of both proteins, separated by namespace, to the R input file.
    fp = open(r_file_in, 'w')
    fp.write("p1\tp2\tp1_go_cc\tp2_go_cc\tp1_go_bp\tp2_go_bp\tp1_go_mf\tp2_go_mf\n")
    for p1, p2 in ppi_tuples:
        p1_go = get_feature_for_accession(feature_df, p1, 'uniprot', 'go')
        p2_go = get_feature_for_accession(feature_df, p2, 'uniprot', 'go')

        # Partition each protein's GO terms by namespace. The comprehension's filter
        # already guarantees every retained term belongs to the requested namespace.
        p1_go_cc = {t for t in p1_go if ontology.id_to_node(t, dag).namespace == 'cellular_component'}
        p2_go_cc = {t for t in p2_go if ontology.id_to_node(t, dag).namespace == 'cellular_component'}
        p1_go_bp = {t for t in p1_go if ontology.id_to_node(t, dag).namespace == 'biological_process'}
        p2_go_bp = {t for t in p2_go if ontology.id_to_node(t, dag).namespace == 'biological_process'}
        p1_go_mf = {t for t in p1_go if ontology.id_to_node(t, dag).namespace == 'molecular_function'}
        p2_go_mf = {t for t in p2_go if ontology.id_to_node(t, dag).namespace == 'molecular_function'}

        fp.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
                        p1, p2,
                        ','.join(p1_go_cc), ','.join(p2_go_cc),
                        ','.join(p1_go_bp), ','.join(p2_go_bp),
                        ','.join(p1_go_mf), ','.join(p2_go_mf)
                    )
                )
    fp.close()

    # Run the R script, then collect its output from the tmp file.
    args = [
        'Rscript',
        'semantic_sim.r',
        '--file={}'.format(r_file_in),
        '--out={}'.format(r_file_out)
    ]
    subprocess.check_call(args)  # raises CalledProcessError if the script exits non-zero

    # Parse the R output into a list of (p1, p2, cc_ss, bp_ss, mf_ss) tuples.
    sims_tuple = []
    with open(r_file_out, 'r') as fp:  # the with block closes the file; no explicit close needed
        for line in fp:
            xs = line.strip().split('\t')
            p1, p2, cc_ss, bp_ss, mf_ss = xs
            sims_tuple.append((p1, p2, cc_ss, bp_ss, mf_ss))
    os.remove(r_file_in)
    os.remove(r_file_out)

    return sims_tuple
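A call might look like this sketch, where the UniProt accessions are illustrative placeholders; note the similarity scores come back as strings, exactly as read from the R output tsv.

sims = compute_ss([('P04637', 'Q00987'), ('P38398', 'Q86YC2')])  # hypothetical accessions
for p1, p2, cc_ss, bp_ss, mf_ss in sims:
    print('{}-{}: cc={} bp={} mf={}'.format(p1, p2, cc_ss, bp_ss, mf_ss))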
Example 4
def depths(df, column):
    """Return the per-row mean and standard deviation of GO term depths in `column`."""
    dag = ontology.load_go_dag('data/gene_ontology.1_2.obo')
    sublists = [x.split(',') for x in df[column].values]
    # List comprehensions (rather than map) so the results are concrete lists under Python 3.
    mean_depths = [np.mean([dag[t].depth for t in sublist if 'go' in t.lower()]) for sublist in sublists]
    std_depths = [np.std([dag[t].depth for t in sublist if 'go' in t.lower()]) for sublist in sublists]
    return mean_depths, std_depths
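Assuming df stores comma-joined GO identifiers in its 'go' column (as the split(',') above expects), usage might look like this hypothetical sketch:

mean_d, std_d = depths(df, 'go')
df['go_mean_depth'] = mean_d   # average ontology depth of each row's GO terms
df['go_std_depth'] = std_d     # spread of those depths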