def get_site_from_extended_site(fin: str, fout: str):
    """Locate the extended mRNA fragment in the region sequence and extract the site.

    Reads the table at ``fin``, adds the columns ``chimera_start``,
    ``chimera_end`` and ``site``, and writes the table to ``fout``.

    Args:
        fin: Path of the input CSV. It must provide the columns
            ``region sequence`` and ``mRNA_seq_extended``.
        fout: Path of the output CSV.
    """
    def calc_chimera_start(seq: str, subseq: str) -> int:
        # ``str.find`` index plus one, or -1 when ``subseq`` is absent or
        # ``seq`` has no ``find`` method (e.g. a missing, non-string cell).
        try:
            position = seq.find(subseq)
        except AttributeError:
            return -1
        return -1 if position == -1 else position + 1

    def calc_chimera_end(chimera_start: int, seq_extended: str) -> int:
        # Propagate the "not found" marker; otherwise offset the start by the
        # fragment length, less HUMAN_SITE_EXTENDED_LEN.
        if chimera_start != -1:
            return (chimera_start + len(seq_extended) - 1 -
                    HUMAN_SITE_EXTENDED_LEN)
        return -1

    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))

    start_func = get_wrapper(calc_chimera_start, 'region sequence',
                             'mRNA_seq_extended')
    df["chimera_start"] = df.apply(func=start_func, axis=1)

    # chimera_end depends on the chimera_start column computed just above.
    end_func = get_wrapper(calc_chimera_end, 'chimera_start',
                           'mRNA_seq_extended')
    df["chimera_end"] = df.apply(func=end_func, axis=1)

    site_func = get_wrapper(get_subsequence_by_coordinates,
                            "region sequence",
                            "chimera_start",
                            "chimera_end",
                            extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_func, axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Exemple #2
0
def insert_gambiae_region_sequence(df) -> DataFrame:
    """Add a ``region_sequence`` column computed row by row.

    Each row's value comes from ``find_gambiae_region_sequence`` applied to
    the columns ``mRNA sequence``, ``region``, ``LEN_5UTR``, ``LEN_CDS`` and
    ``LEN_3UTR``. The frame is modified in place and also returned.
    """
    logger.info("enter to insert_gambiae_region_sequence")
    source_columns = ("mRNA sequence", "region", "LEN_5UTR", "LEN_CDS",
                      "LEN_3UTR")
    row_func = get_wrapper(find_gambiae_region_sequence, *source_columns)
    df["region_sequence"] = df.apply(func=row_func, axis=1)
    return df
Exemple #3
0
def insert_gambiae_region(df) -> DataFrame:
    """Add a ``region`` column computed row by row.

    Each row's value comes from ``find_gambiae_region`` applied to the columns
    ``LEN_5UTR``, ``LEN_CDS``, ``LEN_3UTR``, ``chimera_start`` and
    ``chimera_end``. The frame is modified in place and also returned.
    """
    logger.info("enter to insert_gambiae_region")
    source_columns = ("LEN_5UTR", "LEN_CDS", "LEN_3UTR", "chimera_start",
                      "chimera_end")
    row_func = get_wrapper(find_gambiae_region, *source_columns)
    df["region"] = df.apply(func=row_func, axis=1)
    return df
Exemple #4
0
def human_mapping_merge_by_name(fin: Path, fout: Path):
    """Attach the region sequence to each row by name and verify the match.

    The input table is left-joined with the frame returned by
    ``concatenate_biomart_df("human")`` on ``region`` and on a key built from
    the first two ``_``-separated parts of ``mRNA ID`` (joined with ``|``).
    The joined ``sequence`` column is renamed to ``region sequence`` and a
    boolean ``join_ok`` column records whether ``mRNA_seq_extended`` occurs
    inside it.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
    """
    def verify_sequence(seq: str, subseq: str) -> bool:
        # A non-string ``seq`` (e.g. the NaN left by an unmatched join) has no
        # ``find`` method and is reported as a failed join.
        try:
            return seq.find(subseq) != -1
        except AttributeError:
            return False

    in_df: DataFrame = read_csv(fin)
    in_df["join_key"] = in_df["mRNA ID"].apply(
        lambda x: "|".join(x.split("_")[0:2]))
    mRNA_df = concatenate_biomart_df("human")

    in_df = in_df.merge(mRNA_df,
                        how="left",
                        left_on=["region", "join_key"],
                        right_on=["region", "ID"])

    in_df = in_df.rename(columns={"sequence": "region sequence"})
    # The column name must match the rename above ('region sequence', with a
    # space); it is read again by verify_sequence below. ``.copy()`` makes the
    # selection an independent frame so the column assignment that follows
    # does not act on a slice of the merged frame.
    in_df = in_df[[
        'key', 'paper name', 'miRNA ID', 'miRNA sequence', 'mRNA ID',
        'mRNA_seq_extended', 'region', 'region sequence', 'mRNA_start',
        'mRNA_end_extended'
    ]].copy()

    in_df["join_ok"] = in_df.apply(func=get_wrapper(verify_sequence,
                                                    'region sequence',
                                                    'mRNA_seq_extended'),
                                   axis=1)

    to_csv(in_df, fout)
def df_feature_extractor(valid_df: DataFrame) -> DataFrame:
    """Run ``row_feature_extractor`` over ``valid_df`` via ``apply_in_chunks``.

    The per-row function receives the columns listed in ``input_columns``,
    in that order.
    """
    input_columns = (
        "miRNA sequence", "site", "start", "end", "sequence",
        'mrna_bulge', 'mrna_inter', 'mir_inter', 'mir_bulge',
    )
    row_func = get_wrapper(row_feature_extractor, *input_columns)
    return apply_in_chunks(df=valid_df, func=row_func)
Exemple #6
0
def fast_blast_file(fin: Path, fout: Path, db_title: str):
    """Fill a ``blast sequence`` column by looking each site up in a CSV table.

    The lookup table is ``BIOMART_DATA_PATH / "<db_title>.csv"``; it is loaded
    once and bound to ``df_contains`` as its ``df`` argument, which is then
    applied to the ``site`` column of every row.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
        db_title: Base name (without ``.csv``) of the lookup table.
    """
    logger.info(f"fast blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)

    seq_file = BIOMART_DATA_PATH / f"{db_title}.csv"
    db_df = pd.read_csv(seq_file)
    df_contains_db_title = partial(df_contains, df=db_df)

    row_func = get_wrapper(df_contains_db_title, "site")
    in_df["blast sequence"] = in_df.apply(func=row_func, axis=1)
    to_csv(in_df, fout)
def insert_site_by_coordinates(fin: str, fout: str):
    """Add a ``site`` column cut from ``mRNA sequence`` by chimera coordinates.

    For every row, ``get_subsequence_by_coordinates`` is called with the
    columns ``mRNA sequence``, ``chimera_start`` and ``chimera_end`` plus
    ``extra_chars=SITE_EXTRA_CHARS``.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
    """
    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))

    site_func = get_wrapper(get_subsequence_by_coordinates,
                            "mRNA sequence",
                            "chimera_start",
                            "chimera_end",
                            extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_func, axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Exemple #8
0
def blast_file(fin: Path, fout: Path, db_title: str):
    """Run ``run_blastn`` on every row's ``site`` and append the results.

    The per-row results are concatenated column-wise to the input table and
    the combined table is written to ``fout``.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
        db_title: Forwarded to ``run_blastn`` as ``db_title``.
    """
    logger.info(f"blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)

    row_func = get_wrapper(run_blastn, "site", db_title=db_title)
    blastn_df: DataFrame = in_df.apply(func=row_func, axis=1)
    combined = pd.concat([in_df, blastn_df], axis=1)

    # Disabled in the original:
    # in_df["blast region"] = in_df["blast sequence"].apply(lambda x: "" if np.isnan(x) else db_title)

    to_csv(combined, fout)
Exemple #9
0
def duplex(method: str, fin: str, fout: str):
    """Compute duplexes for the valid rows and merge them back onto the table.

    Only rows selected by the ``valid_row`` query are passed to ``do_duplex``;
    the results are left-joined onto the full input by index, so the other
    rows keep missing values in the duplex columns. A ``duplex_method`` column
    records ``method``.

    Args:
        method: Key into ``DUPLEX_DICT`` selecting the duplex class.
        fin: Path of the input CSV.
        fout: Path of the output CSV.
    """
    duplex_cls: Duplex = DUPLEX_DICT[method]
    logger.info(f"{method} do_duplex to {fin}")
    in_df: DataFrame = read_csv(Path(fin))

    # Alternative filter kept from the original:
    # [in_df["miRNA sequence"].notnull() & in_df.site.notnull()]
    valid_rows = in_df.query("valid_row")
    row_func = get_wrapper(do_duplex, "miRNA sequence", "site",
                           cls=duplex_cls)
    duplex_df = valid_rows.apply(func=row_func, axis=1)

    result = in_df.merge(duplex_df,
                         how='left',
                         left_index=True,
                         right_index=True)
    result["duplex_method"] = method
    to_csv(result, Path(fout))
Exemple #10
0
def insert_site_from_chromosome(fin: str, fout: str, chr_dir: str):
    """Add an upper-cased ``site`` column read from chromosome files.

    For every row, ``extract_seq_from_chromosome`` is called with the columns
    ``chr``, ``start``, ``end`` and ``strand`` plus
    ``directory=Path(chr_dir)``; the returned value is then upper-cased.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
        chr_dir: Directory handed to ``extract_seq_from_chromosome``.
    """
    logger.info(f"Insert site from chromosome to {fin}")
    df: DataFrame = read_csv(Path(fin))

    extract_func = get_wrapper(extract_seq_from_chromosome,
                               'chr', 'start', 'end', 'strand',
                               directory=Path(chr_dir))
    raw_sites = df.apply(func=extract_func, axis=1)
    df["site"] = raw_sites
    df["site"] = df["site"].apply(lambda seq: seq.upper())

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
Exemple #11
0
def finalize(fin: str, fout: str):
    """Produce the normalized table: site, widened coordinates, U-alphabet, flags.

    Steps, in order:
      1. Extract ``site`` from ``sequence`` using ``start``/``end`` for rows
         that have a sequence.
      2. Widen ``start``/``end`` by ``SITE_EXTRA_CHARS`` (start is floored at 1).
      3. Replace every ``T`` with ``U`` in the sequence columns.
      4. Add ``seed_family`` from ``miRNA sequence``.
      5. Add the boolean ``valid_row`` flag.
      6. Keep only ``NORMALIZATION_COLUMNS`` and write the result to ``fout``.

    Args:
        fin: Path of the input CSV.
        fout: Path of the output CSV.
    """
    df: DataFrame = read_csv(Path(fin))

    logger.info("extract the site")
    # Must run before start/end are widened below: the extraction reads the
    # original coordinates and receives the padding via ``extra_chars``.
    df["site"] = df[df["sequence"].notnull()].apply(func=get_wrapper(
        get_subsequence_by_coordinates_no_exception,
        "sequence",
        "start",
        "end",
        extra_chars=SITE_EXTRA_CHARS),
                                                    axis=1)

    # Rows with a missing coordinate are skipped by the filter and therefore
    # stay missing after the index-aligned assignment.
    df["start"] = df[df["start"].notnull()]["start"].apply(
        lambda x: max(int(x) - SITE_EXTRA_CHARS, 1))
    df["end"] = df[df["end"].notnull()]["end"].apply(
        lambda x: int(x) + SITE_EXTRA_CHARS)

    logger.info("replace T with U")
    seq_cols = ['miRNA sequence', 'site', 'sequence']
    df[seq_cols] = df[seq_cols].replace(to_replace='T', value='U', regex=True)

    logger.info("Add seed family")
    df["seed_family"] = df['miRNA sequence'].apply(extract_seed_family)

    logger.info("Add valid/invalid flag")
    # A row is invalid when a key sequence is missing or contains one of the
    # marker substrings checked below.
    invalid_conditions = [
        pd.isna(df["miRNA sequence"]),
        pd.isna(df["site"]), df["miRNA sequence"].str.contains('X'),
        df["miRNA sequence"].str.contains('N'), df["site"].str.contains("N"),
        df["site"].str.contains("Error"), df["sequence"].str.contains('N'),
        df["sequence"].str.contains('X'), df["sequence"].str.contains("Error"),
        df["sequence"].str.contains("None")
    ]
    df["valid_row"] = ~reduce((lambda x, y: x | y), invalid_conditions)

    df = df[NORMALIZATION_COLUMNS]
    to_csv(df, Path(fout))
Exemple #12
0
def qclash_melanoma_mirna_seq_insertion(fname: str):
    """Fill the ``miRNA sequence`` column of a file using the ``hsa`` miRBase rows.

    Reads ``READ_PATH / fname``, applies ``qclash_mirna_func`` to the columns
    ``miRNA ID`` and ``mirna_seq_tmp`` of every row (passing the miRBase rows
    whose ``prefix`` is ``hsa`` as ``mirbase_hsa``), and writes the result to
    ``MIRNA_SEQ_PATH / fname``.

    Args:
        fname: File name, relative to both ``READ_PATH`` and ``MIRNA_SEQ_PATH``.
    """
    logger.info(f"Insert mirna sequence to {fname}")

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname

    df: DataFrame = read_csv(fin_full_path)

    mirbase_columns = ["miRNA ID", "miRNA sequence", "prefix"]
    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE, usecols=mirbase_columns)
    hsa = mirbase_df.query("prefix == 'hsa'")

    row_func = get_wrapper(qclash_mirna_func,
                           'miRNA ID',
                           'mirna_seq_tmp',
                           mirbase_hsa=hsa)
    df["miRNA sequence"] = df.apply(func=row_func,
                                    axis=1,
                                    result_type="expand")

    to_csv(df, fout_full_path)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
Exemple #13
0
def run(out_filename: str):
    """Split chimeric reads into miRNA and site parts and save the table.

    ``chimera_split`` is applied to the read sequence and its four coordinate
    columns; its expanded result fills ``mirna_seq_tmp`` and ``site``, in
    which every ``T`` is then replaced by ``U``. The frame is passed through
    ``change_columns_names`` and ``add_meta_data`` before being saved.

    Args:
        out_filename: Forwarded to ``save``.
    """
    df = read()

    split_columns = ["mirna_seq_tmp", "site"]
    splitter = get_wrapper(
        chimera_split,
        'ReadSequences',
        'Read_start_5',
        'Read_end_5',
        'Read_start_3',
        'Read_end_3',
    )
    df[split_columns] = df.apply(func=splitter,
                                 axis=1,
                                 result_type="expand")

    df[split_columns] = df[split_columns].replace(to_replace='T',
                                                  value='U',
                                                  regex=True)

    df = change_columns_names(df)
    df = add_meta_data(df)
    save(df, out_filename)
Exemple #14
0
def run_blastn(seq: str, db_title: str) -> Series:
    """BLAST ``seq`` against ``db_title`` and return the best full-match hit.

    The query is written to a temporary FASTA file and searched with
    ``blastn`` (task ``blastn-short``, plus strand, e-value 1, tabular output
    format 6). Hits are kept when they meet ``MINIMAL_BLAST_IDENTITY`` and
    ``MINIMAL_BLAST_COVERAGE`` and have no gap opens. The surviving hits are
    joined with ``BIOMART_DATA_PATH / "<db_title>.csv"`` on the subject
    accession, and the hit with the largest ``sequence length`` is returned.

    Args:
        seq: Query sequence. A value without a length (e.g. NaN from a
            missing cell) or shorter than ``MINIMAL_LENGTH_TO_BLAST`` is not
            searched.
        db_title: BLAST database name, also the base name of the transcript
            CSV.

    Returns:
        A Series indexed by ``Gene_ID``, ``sequence``, ``identity``,
        ``coverage``, ``s.start`` and ``s.end``. All values are missing when
        the query is not searched or no hit passes the filters.
    """
    def blast_coverage(start: int, end: int, query: str) -> float:
        # Hit span (inclusive) as a percentage of the query length.
        return (end - start + 1.0) * 100 / len(query)

    RETURN_COL = [
        "Gene_ID", "sequence", "identity", "coverage", "s.start", "s.end"
    ]

    def no_hit() -> Series:
        # Uniform "nothing found" result: every return column is missing.
        return pd.Series(index=RETURN_COL)

    try:
        too_short = len(seq) < MINIMAL_LENGTH_TO_BLAST
    except TypeError:
        # ``seq`` has no length, so there is nothing to search.
        return no_hit()
    if too_short:
        return no_hit()

    with NamedTemporaryFile(prefix="blast") as blast_out_file:
        with NamedTemporaryFile(prefix="blast") as seq_to_find_file:
            record = SeqRecord(Seq(seq), description="seq_to_find")

            SeqIO.write(record, seq_to_find_file.name, "fasta")

            cline = NcbiblastnCommandline(query=str(seq_to_find_file.name),
                                          db=db_title,
                                          evalue=1,
                                          strand="plus",
                                          task="blastn-short",
                                          out=str(blast_out_file.name),
                                          outfmt=6)

            call_wrapper(cmd=str(cline), cwd=BIOMART_BLAST_PATH)

        # Parse the output file while the temporary file still exists.
        colnames = [
            'query acc.ver', 'subject acc.ver', '%identity',
            'alignment length', 'mismatches', 'gap opens', 'q.start', 'q.end',
            's.start', 's.end', 'evalue', 'bit score'
        ]
        result = pd.read_csv(blast_out_file.name,
                             sep='\t',
                             names=colnames,
                             header=None)
        result = result.rename(
            columns={
                '%identity': "identity",
                'alignment length': 'alignment_length',
                "gap opens": "gap_opens"
            })

        # No hits at all: return early instead of relying on the column
        # assignment below to fail on an empty frame.
        if result.empty:
            return no_hit()

        result["coverage"] = result.apply(func=get_wrapper(blast_coverage,
                                                           's.start',
                                                           's.end',
                                                           query=seq),
                                          axis=1)

        # Consider the full match rows only
        result = result.query(
            "identity >= @MINIMAL_BLAST_IDENTITY and "
            "coverage >= @MINIMAL_BLAST_COVERAGE and "
            "gap_opens == 0")
        result = result.reset_index()

        if result.empty:
            return no_hit()

        # get the full sequence
        transcripts: DataFrame = pd.read_csv(BIOMART_DATA_PATH /
                                             f"{db_title}.csv")
        result = result.merge(transcripts,
                              how="left",
                              left_on="subject acc.ver",
                              right_on="ID")

        # Choose the row with the largest "sequence length". ``idxmax``
        # returns an index label, so the row is selected by label.
        best = result.loc[result["sequence length"].idxmax()]
        best = best.rename({'ID': "Gene_ID"})

        return best[RETURN_COL]