Beispiel #1
0
def mirnaid_fix(fname: str):
    """Rewrite the 'miRNA ID' column of READ_PATH/fname by joining against
    the newest mirbase entry per miRNA sequence, then save to MIRNA_SEQ_PATH.

    The mirbase organism prefix is inferred from the file name; raises
    Exception when no known organism keyword appears in *fname*.
    """
    prefix_by_organism = {
        "mouse": "mmu",
        "human": "hsa",
        "elegans": "cel",
        "cattle": "bta",
        "fly": "aga",
    }
    prefix = None
    for organism, organism_prefix in prefix_by_organism.items():
        # No break: last matching keyword wins (preserves original behavior).
        if organism in fname:
            prefix = organism_prefix
    if prefix is None:
        raise Exception("unrecognized mirbase prefix")

    # Keep only the highest-version mirbase record for each miRNA sequence.
    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE).query("prefix==@prefix")
    mirbase_df.sort_values(by="version", ascending=False, inplace=True)
    mirbase_df.drop_duplicates("miRNA sequence", keep="first", inplace=True)

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname

    # FIX: the original reused the name `d` for both the prefix dict and this
    # DataFrame; renamed to avoid shadowing.
    in_df: DataFrame = read_csv(fin_full_path)

    # Both frames carry a 'miRNA ID' column, so the merge suffixes them;
    # take the mirbase-side value (_y).
    join_df = in_df.merge(mirbase_df,
                          how="left",
                          left_on="miRNA sequence",
                          right_on="miRNA sequence")
    in_df['miRNA ID'] = join_df['miRNA ID_y']
    to_csv(in_df, fout_full_path)
Beispiel #2
0
def get_site_from_extended_site(fin: str, fout: str):
    """Compute chimera coordinates and extract the site sequence.

    For each row: locate 'mRNA_seq_extended' inside 'region sequence' to get
    a 1-based chimera_start, derive chimera_end from the extended length
    minus HUMAN_SITE_EXTENDED_LEN, then cut the site out of the region
    sequence with SITE_EXTRA_CHARS flanking characters. -1 marks "not found".
    """
    def calc_chimera_start(seq: str, subseq: str) -> int:
        # FIX: the original called seq.find(subseq) twice; compute it once.
        # AttributeError covers non-string seq (e.g. NaN from a failed join).
        try:
            pos = seq.find(subseq)
        except AttributeError:
            return -1
        return -1 if pos == -1 else pos + 1  # convert to 1-based

    def calc_chimera_end(chimera_start: int, seq_extended: str) -> int:
        if chimera_start == -1:
            return -1
        # The extended sequence carries HUMAN_SITE_EXTENDED_LEN extra
        # characters that are not part of the site itself.
        return chimera_start + len(seq_extended) - 1 - HUMAN_SITE_EXTENDED_LEN

    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    df["chimera_start"] = df.apply(func=get_wrapper(calc_chimera_start,
                                                    'region sequence',
                                                    'mRNA_seq_extended'),
                                   axis=1)
    df["chimera_end"] = df.apply(func=get_wrapper(calc_chimera_end,
                                                  'chimera_start',
                                                  'mRNA_seq_extended'),
                                 axis=1)

    df["site"] = df.apply(func=get_wrapper(get_subsequence_by_coordinates,
                                           "region sequence",
                                           "chimera_start",
                                           "chimera_end",
                                           extra_chars=SITE_EXTRA_CHARS),
                          axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Beispiel #3
0
def human_mapping_merge_by_name(fin: Path, fout: Path):
    """Join human biomart region sequences onto the interactions by
    (region, transcript-name) key, keep the normalization columns, and flag
    each row with 'join_ok' when the extended mRNA sequence is contained in
    the joined region sequence.
    """
    def verify_sequence(seq: str, subseq: str) -> bool:
        # False also covers non-string seq (NaN from a failed left join).
        try:
            return seq.find(subseq) != -1
        except AttributeError:
            return False

    in_df: DataFrame = read_csv(fin)
    # Join key: first two '_'-separated tokens of the mRNA ID, '|'-joined.
    in_df["join_key"] = in_df["mRNA ID"].apply(
        lambda x: "|".join(x.split("_")[0:2]))
    mRNA_df = concatenate_biomart_df("human")

    in_df = in_df.merge(mRNA_df,
                        how="left",
                        left_on=["region", "join_key"],
                        right_on=["region", "ID"])

    in_df = in_df.rename(columns={"sequence": "region sequence"})
    # BUGFIX: the selection previously listed 'region_sequence' (underscore),
    # but the rename above produces 'region sequence' (space) — the very
    # column verify_sequence reads below — so the selection would fail or
    # drop it. Use the renamed column.
    in_df = in_df[[
        'key', 'paper name', 'miRNA ID', 'miRNA sequence', 'mRNA ID',
        'mRNA_seq_extended', 'region', 'region sequence', 'mRNA_start',
        'mRNA_end_extended'
    ]]

    in_df["join_ok"] = in_df.apply(func=get_wrapper(verify_sequence,
                                                    'region sequence',
                                                    'mRNA_seq_extended'),
                                   axis=1)

    to_csv(in_df, fout)
Beispiel #4
0
def get_model_output_10_classes(filename):
    """Run every *.wav under AUDIO_DATA_FOLDER through the frozen TF graph in
    *filename*, post-process the activations (fixed length 20, PCA to 25
    components, flattened), and write them to audio10classes.csv.

    Labels are derived from the numeric file stem; stems below 1000 are
    skipped and 1000 is subtracted from the rest.
    """
    with tf.gfile.GFile(filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tensors_and_ops = tf.import_graph_def(graph_def, name='')
        with tf.Session(graph=graph) as sess:
            output_op = graph.get_operation_by_name('Minimum_3').outputs[0]
            xs = []
            ys = []
            filename_list = []
            pattern = os.path.join(Constants.AUDIO_DATA_FOLDER, '*wav*', '*.wav')
            # FIX: loop variable renamed — it used to shadow the `filename`
            # parameter (the graph path).
            for idx, wav_path in enumerate(glob.glob(pattern)):
                name = wav_path.split('/')[-2:]
                # BUGFIX: `.strip('.wav')` strips the characters '.', 'w',
                # 'a', 'v' from both ends rather than removing the extension;
                # use splitext to take the numeric stem.
                label = os.path.splitext(name[-1])[0]
                if int(label) < 1000:
                    continue
                y = str(int(label) - 1000)
                name = '/'.join(name)
                name = name.replace('.wav', '')
                filename_list.append(name)
                if idx % 50 == 0:
                    print(name)
                fs, audio = wav.read(wav_path)
                x = audioToInputVector(audio, fs, N_FEATURES, N_CONTEXT)
                out = sess.run(output_op, {'input_node:0': [
                               x], 'input_lengths:0': [len(x)]})
                xs.append(out)
                ys.append(y)
            xs = fix_seq_length(xs, length=20)
            xs = apply_pca(xs, n_components=25)
            xs = np.array([np.ravel(x) for x in xs])
            to_csv(xs, ys, os.path.join(Constants.DATA_FOLDER, 'audio10classes.csv'),
                   filename_list=filename_list)
Beispiel #5
0
def feature_extraction(fin: str, fout: str):
    """Extract features for the valid rows of *fin* and left-join them back
    onto the full table by index, writing the result to *fout*.

    Rows failing the validity query get NaN feature columns.
    """
    source_df: DataFrame = read_csv(Path(fin))
    eligible = source_df.query("valid_row & duplex_valid=='True'")
    features = df_feature_extractor(eligible)
    merged = pd.merge(left=source_df,
                      right=features,
                      left_index=True,
                      right_index=True,
                      how='left')
    to_csv(merged, Path(fout))
Beispiel #6
0
def fast_blast_file(fin: Path, fout: Path, db_title: str):
    """Match each row's 'site' against the biomart sequence file for
    *db_title* (via df_contains) and store the result in 'blast sequence'."""
    logger.info(f"fast blast file {fin} against {db_title}")
    interactions: DataFrame = read_csv(fin)
    db_df = pd.read_csv(BIOMART_DATA_PATH / f"{db_title}.csv")
    contains_in_db = partial(df_contains, df=db_df)

    row_fn = get_wrapper(contains_in_db, "site")
    interactions["blast sequence"] = interactions.apply(func=row_fn, axis=1)
    to_csv(interactions, fout)
Beispiel #7
0
def rna_insertion(fin_full_path: Path, fout_full_path: Path,
                  rna_df: DataFrame):
    """Left-join the mRNA data in *rna_df* onto the interactions file by
    'mRNA ID' (many-to-one validated) and write the joined table out."""
    logger.info(f"Insert rna sequence to {fin_full_path}")
    interactions: DataFrame = read_csv(fin_full_path)
    joined = interactions.merge(rna_df,
                                how="left",
                                on="mRNA ID",
                                validate="many_to_one")
    to_csv(joined, fout_full_path)
    logger.info(f"Finish the rna sequence insertion to {fin_full_path}")
Beispiel #8
0
def insert_site_by_coordinates(fin: str, fout: str):
    """Cut the 'site' column out of 'mRNA sequence' using the
    chimera_start/chimera_end coordinates plus SITE_EXTRA_CHARS flanks."""
    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    site_fn = get_wrapper(get_subsequence_by_coordinates,
                          "mRNA sequence",
                          "chimera_start",
                          "chimera_end",
                          extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_fn, axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Beispiel #9
0
def blast_file(fin: Path, fout: Path, db_title: str):
    """Run blastn on each row's 'site' against *db_title* and append the
    blast result columns to the input table."""
    logger.info(f"blast file {fin} against {db_title}")
    interactions: DataFrame = read_csv(fin)
    blast_fn = get_wrapper(run_blastn, "site", db_title=db_title)
    blastn_df: DataFrame = interactions.apply(func=blast_fn, axis=1)
    combined = pd.concat([interactions, blastn_df], axis=1)
    to_csv(combined, fout)
Beispiel #10
0
def df_col_rename(fname: Path):
    """Rename 'region_sequence' to 'sequence' and prepend empty blast
    bookkeeping columns, rewriting *fname* in place."""
    df: DataFrame = read_csv(fname)
    df = df.rename(columns={'region_sequence': 'sequence'})
    print(df.columns)
    # Each insert goes to position 0, so the final leading order is
    # coverage, identity, region count.
    for new_col in ('region count', 'identity', 'coverage'):
        df.insert(0, new_col, np.nan)
    to_csv(df, fname)
Beispiel #11
0
def gambiae_run(fin: str, fout: str):
    """Build the gambiae interactions table: add region information and
    sequences, convert chimera coordinates to region-relative start/end,
    and drop the raw GAMBIAE_INFORMATION_USECOLS columns."""
    df: DataFrame = add_gambiae_region_information(Path(fin))
    df = insert_gambiae_region(df)
    df = insert_gambiae_region_sequence(df)

    # Offset of the region within the full mRNA sequence, used to shift
    # both coordinates into region-relative positions.
    def region_relative(row, coord_col):
        offset = row['mRNA sequence'].find(row['region_sequence'])
        return row[coord_col] - offset

    df["start"] = df.apply(lambda row: region_relative(row, 'chimera_start'),
                           axis=1)
    df["end"] = df.apply(lambda row: region_relative(row, 'chimera_end'),
                         axis=1)
    # NOTE(review): the target name ",Gene_ID" carries a leading comma —
    # looks like a typo for "Gene_ID"; kept byte-for-byte to preserve
    # behavior. Confirm against downstream consumers.
    df.rename(columns={"TRANSCRIPT_ID": ",Gene_ID"}, inplace=True)
    keep_cols = [c for c in df.columns
                 if c not in GAMBIAE_INFORMATION_USECOLS]

    to_csv(df[keep_cols], Path(fout))
Beispiel #12
0
def duplex(method: str, fin: str, fout: str):
    """Compute miRNA/site duplexes for the valid rows of *fin* with the
    duplex class selected by *method*, join the results back by index, and
    tag every row with the method used."""
    duplex_cls: Duplex = DUPLEX_DICT[method]
    logger.info(f"{method} do_duplex to {fin}")
    in_df: DataFrame = read_csv(Path(fin))
    duplex_fn = get_wrapper(do_duplex, "miRNA sequence", "site",
                            cls=duplex_cls)
    # Only rows flagged valid are folded; the left join below leaves NaN
    # duplex columns for the rest.
    duplex_df = in_df.query("valid_row").apply(func=duplex_fn, axis=1)

    result = pd.merge(left=in_df,
                      right=duplex_df,
                      left_index=True,
                      right_index=True,
                      how='left')

    result["duplex_method"] = method
    to_csv(result, Path(fout))
Beispiel #13
0
def insert_site_from_chromosome(fin: str, fout: str, chr_dir: str):
    """Fill the 'site' column by extracting the (chr, start, end, strand)
    coordinates from the per-chromosome files under *chr_dir*, upper-cased."""
    logger.info(f"Insert site from chromosome to {fin}")
    df: DataFrame = read_csv(Path(fin))
    df["site"] = df.apply(func=get_wrapper(extract_seq_from_chromosome,
                                           'chr',
                                           'start',
                                           'end',
                                           'strand',
                                           directory=Path(chr_dir)),
                          axis=1)
    # ROBUSTNESS: .str.upper() propagates NaN/non-string values instead of
    # raising AttributeError like the previous `lambda x: x.upper()` did.
    df["site"] = df["site"].str.upper()

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Beispiel #14
0
def mirna_seq_insertion(fname: str):
    """Attach the mirbase 'miRNA sequence' to READ_PATH/fname by left-joining
    on 'miRNA ID', writing the result under MIRNA_SEQ_PATH."""
    logger.info(f"Insert mirna sequence to {fname}")

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname

    df: DataFrame = read_csv(fin_full_path)
    # Remove any pre-existing sequence column; the join below re-adds it.
    df.drop(columns=["miRNA sequence"], inplace=True, errors='ignore')
    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE,
                                        usecols=["miRNA ID", "miRNA sequence"])
    joined = df.merge(mirbase_df, how="left", on="miRNA ID")
    to_csv(joined, fout_full_path)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
Beispiel #15
0
def finalize(fin: str, fout: str):
    """Normalize an interactions file: extract the site with extra flanking
    characters, shift start/end back by SITE_EXTRA_CHARS, convert T->U, add
    the seed family, flag invalid rows, and keep NORMALIZATION_COLUMNS only.
    """
    df: DataFrame = read_csv(Path(fin))

    logger.info("extract the site")
    df["site"] = df[df["sequence"].notnull()].apply(func=get_wrapper(
        get_subsequence_by_coordinates_no_exception,
        "sequence",
        "start",
        "end",
        extra_chars=SITE_EXTRA_CHARS),
                                                    axis=1)

    # FIX: removed the unused inner helper `eta` (dead debug code duplicating
    # the lambda below) and hoisted the double int(x) conversion.
    def _shift_start(x):
        # Undo the SITE_EXTRA_CHARS padding, clamped at position 1.
        x = int(x)
        return x - SITE_EXTRA_CHARS if x > SITE_EXTRA_CHARS else 1

    df["start"] = df[df["start"].notnull()]["start"].apply(_shift_start)
    df["end"] = df[df["end"].notnull()]["end"].apply(
        lambda x: int(x) + SITE_EXTRA_CHARS)

    logger.info("replace T with U")
    seq_cols = ['miRNA sequence', 'site', 'sequence']
    df[seq_cols] = df[seq_cols].replace(to_replace='T', value='U', regex=True)

    logger.info("Add seed family")
    df["seed_family"] = df['miRNA sequence'].apply(extract_seed_family)

    logger.info("Add valid/invalid flag")
    # A row is valid only when none of these conditions hold.
    invalid_conditions = [
        pd.isna(df["miRNA sequence"]),
        pd.isna(df["site"]), df["miRNA sequence"].str.contains('X'),
        df["miRNA sequence"].str.contains('N'), df["site"].str.contains("N"),
        df["site"].str.contains("Error"), df["sequence"].str.contains('N'),
        df["sequence"].str.contains('X'), df["sequence"].str.contains("Error"),
        df["sequence"].str.contains("None")
    ]
    df["valid_row"] = ~reduce((lambda x, y: x | y), invalid_conditions)

    df = df[NORMALIZATION_COLUMNS]
    to_csv(df, Path(fout))
Beispiel #16
0
def qclash_melanoma_mirna_seq_insertion(fname: str):
    """Resolve the 'miRNA sequence' column of READ_PATH/fname via
    qclash_mirna_func against the human (hsa) subset of mirbase, writing the
    result under MIRNA_SEQ_PATH."""
    logger.info(f"Insert mirna sequence to {fname}")

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname

    df: DataFrame = read_csv(fin_full_path)
    mirbase_df: DataFrame = pd.read_csv(
        MIRBASE_FILE, usecols=["miRNA ID", "miRNA sequence", "prefix"])

    hsa_df = mirbase_df.query("prefix == 'hsa'")
    mirna_fn = get_wrapper(qclash_mirna_func,
                           'miRNA ID',
                           'mirna_seq_tmp',
                           mirbase_hsa=hsa_df)
    df["miRNA sequence"] = df.apply(func=mirna_fn,
                                    axis=1,
                                    result_type="expand")

    to_csv(df, fout_full_path)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
Beispiel #17
0
def concat_blast_result(directory: Path, fname: str,
                        blast_prev_step_file: Path, fout: Path):
    """Concatenate the per-chunk blast result files matching *fname* in
    *directory*, annotate each hit with the number of hits sharing its key,
    re-append the interactions that produced no blast hit, and write the
    united, key-sorted table to *fout*.
    """
    blast_result_list = [
        read_blast_result_file(f) for f in directory.glob(f"*{fname}_*.csv")
    ]
    logger.info("Finish read the files. start to concatenate")
    blast_result_df = pd.concat(blast_result_list, axis=0, ignore_index=True)

    # Per-key hit count, attached by merging the value_counts Series back in.
    # NOTE(review): reading "key_y" assumes the counts Series is itself named
    # "key" so the merge suffixes the two columns — this naming is
    # pandas-version-sensitive (newer value_counts names the result "count");
    # verify against the project's pinned pandas.
    vc = blast_result_df["key"].value_counts()
    blast_result_df["region count"] = \
        blast_result_df.merge(vc, how="left", left_on="key", right_index=True)["key_y"]
    blast_result_inx = blast_result_df["key"].unique()

    # Interactions with no blast hit: mark region "None" / count 0 and keep
    # only those whose key is absent from the blast results.
    all_interactions: DataFrame = read_csv(blast_prev_step_file)
    all_interactions["region"] = "None"
    all_interactions["region count"] = 0
    all_interactions.query("key not in @blast_result_inx", inplace=True)

    # Unite, sort by key, and promote the subject coordinates (s.start/s.end)
    # to the canonical start/end column names.
    unite = pd.concat([blast_result_df, all_interactions], axis=0)
    unite.sort_values(by="key", ignore_index=True, inplace=True)
    unite.drop(columns=["start", "end"], inplace=True, errors="ignore")
    unite = unite.rename(columns={"s.start": "start", "s.end": "end"})
    to_csv(unite, fout)
Beispiel #18
0

"""
Train an auditive som, test it alongside the visual one
"""

somv_path = os.path.join(Constants.DATA_FOLDER,
                         '10classes',
                         'visual_model')

somu_path = os.path.join(Constants.DATA_FOLDER,
                         '10classes',
                         'audio_model')

audio_data_path = os.path.join(Constants.DATA_FOLDER,
                               '10classes',
                               'audio_data.csv')

if __name__ == '__main__':
    xs, ys, filenames = from_csv_with_filenames(audio_data_path)
    vect_size = len(xs[0])
    xs = MinMaxScaler().fit_transform(xs)
    audio_som = SOM(20, 30, vect_size, n_iterations=100,
        checkpoint_dir=somu_path)
    proto = get_prototypes(xs, [int(y) - 1000 for y in ys])
    to_csv(proto.T, ys, os.path.join(Constants.DATA_FOLDER,
                                     '10classes',
                                     'audio_prototypes.csv'))
    audio_som.train(xs)
    iterativeTraining(somv_path, somu_path)
Beispiel #19
0
def save(df: DataFrame, file_name: str):
    """Write *df* to READ_PATH/file_name via the project's to_csv helper."""
    to_csv(df, READ_PATH / file_name)