def get_site_from_extended_site(fin: str, fout: str):
    """Add chimera coordinates and the site sequence to a CSV table.

    Reads the table at ``fin``, computes the columns ``chimera_start``,
    ``chimera_end`` and ``site`` row by row, and writes the table to ``fout``.

    Args:
        fin: path of the input CSV.
        fout: path of the output CSV.
    """

    def calc_chimera_start(seq: str, subseq: str) -> int:
        # 1-based position of ``subseq`` inside ``seq``; -1 when it is not
        # found or when ``seq`` has no ``find`` method (e.g. a missing value).
        try:
            position = seq.find(subseq)
        except AttributeError:
            return -1
        if position == -1:
            return -1
        return position + 1

    def calc_chimera_end(chimera_start: int, seq_extended: str) -> int:
        # Propagate the "not found" marker produced by calc_chimera_start.
        if chimera_start == -1:
            return -1
        extended_len = len(seq_extended)
        return chimera_start + extended_len - 1 - HUMAN_SITE_EXTENDED_LEN

    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))

    start_func = get_wrapper(calc_chimera_start,
                             'region sequence', 'mRNA_seq_extended')
    df["chimera_start"] = df.apply(func=start_func, axis=1)

    end_func = get_wrapper(calc_chimera_end,
                           'chimera_start', 'mRNA_seq_extended')
    df["chimera_end"] = df.apply(func=end_func, axis=1)

    site_func = get_wrapper(get_subsequence_by_coordinates,
                            "region sequence", "chimera_start", "chimera_end",
                            extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_func, axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def insert_gambiae_region_sequence(df) -> DataFrame:
    """Fill the ``region_sequence`` column using ``find_gambiae_region_sequence``.

    The helper is applied row-wise with the columns ``mRNA sequence``,
    ``region``, ``LEN_5UTR``, ``LEN_CDS`` and ``LEN_3UTR``. ``df`` is modified
    in place and also returned.
    """
    logger.info("enter to insert_gambiae_region_sequence")
    source_columns = ("mRNA sequence", "region",
                      "LEN_5UTR", "LEN_CDS", "LEN_3UTR")
    row_func = get_wrapper(find_gambiae_region_sequence, *source_columns)
    df["region_sequence"] = df.apply(func=row_func, axis=1)
    return df
def insert_gambiae_region(df) -> DataFrame:
    """Fill the ``region`` column using ``find_gambiae_region``.

    The helper is applied row-wise with the columns ``LEN_5UTR``, ``LEN_CDS``,
    ``LEN_3UTR``, ``chimera_start`` and ``chimera_end``. ``df`` is modified in
    place and also returned.
    """
    logger.info("enter to insert_gambiae_region")
    source_columns = ("LEN_5UTR", "LEN_CDS", "LEN_3UTR",
                      "chimera_start", "chimera_end")
    row_func = get_wrapper(find_gambiae_region, *source_columns)
    df["region"] = df.apply(func=row_func, axis=1)
    return df
def human_mapping_merge_by_name(fin: Path, fout: Path):
    """Attach the human region sequence to each row and flag whether it matches.

    Reads the table at ``fin``, builds a join key from the first two
    ``_``-separated fields of ``mRNA ID``, left-merges it with the table
    returned by ``concatenate_biomart_df("human")`` on ``region`` and the key,
    and writes the selected columns plus a ``join_ok`` flag to ``fout``.

    Args:
        fin: path of the input CSV.
        fout: path of the output CSV.
    """

    def verify_sequence(seq: str, subseq: str) -> bool:
        # True when ``subseq`` occurs in ``seq``; a ``seq`` without ``find``
        # (e.g. a missing value from the left merge) counts as a failed join.
        try:
            return seq.find(subseq) != -1
        except AttributeError:
            return False

    in_df: DataFrame = read_csv(fin)
    in_df["join_key"] = in_df["mRNA ID"].apply(
        lambda x: "|".join(x.split("_")[0:2]))

    mRNA_df = concatenate_biomart_df("human")
    in_df = in_df.merge(mRNA_df, how="left",
                        left_on=["region", "join_key"],
                        right_on=["region", "ID"])
    in_df = in_df.rename(columns={"sequence": "region sequence"})

    # The merged sequence column is named "region sequence" (with a space)
    # after the rename above. The selection must use the same name, otherwise
    # it either raises KeyError or drops the column that verify_sequence reads.
    in_df = in_df[[
        'key', 'paper name', 'miRNA ID', 'miRNA sequence', 'mRNA ID',
        'mRNA_seq_extended', 'region', 'region sequence', 'mRNA_start',
        'mRNA_end_extended'
    ]]

    in_df["join_ok"] = in_df.apply(
        func=get_wrapper(verify_sequence,
                         'region sequence', 'mRNA_seq_extended'),
        axis=1)
    to_csv(in_df, fout)
def df_feature_extractor(valid_df: DataFrame) -> DataFrame:
    """Run ``row_feature_extractor`` over ``valid_df`` via ``apply_in_chunks``.

    Each row is passed to the extractor through ``get_wrapper`` with the
    columns listed in ``feature_inputs``.
    """
    feature_inputs = (
        "miRNA sequence", "site", "start", "end", "sequence",
        'mrna_bulge', 'mrna_inter', 'mir_inter', 'mir_bulge',
    )
    row_func = get_wrapper(row_feature_extractor, *feature_inputs)
    return apply_in_chunks(df=valid_df, func=row_func)
def fast_blast_file(fin: Path, fout: Path, db_title: str):
    """Fill ``blast sequence`` by looking each site up in a BioMart CSV.

    The reference table is loaded from ``BIOMART_DATA_PATH / "<db_title>.csv"``
    and bound to ``df_contains``; the bound function is applied to the ``site``
    column of every row of ``fin``. The result is written to ``fout``.
    """
    logger.info(f"fast blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)

    seq_file = BIOMART_DATA_PATH / f"{db_title}.csv"
    reference_df = pd.read_csv(seq_file)
    df_contains_db_title = partial(df_contains, df=reference_df)

    row_func = get_wrapper(df_contains_db_title, "site")
    in_df["blast sequence"] = in_df.apply(func=row_func, axis=1)
    to_csv(in_df, fout)
def insert_site_by_coordinates(fin: str, fout: str):
    """Cut the site out of ``mRNA sequence`` using the chimera coordinates.

    Reads ``fin``, fills the ``site`` column by applying
    ``get_subsequence_by_coordinates`` to ``mRNA sequence``, ``chimera_start``
    and ``chimera_end`` (with ``extra_chars=SITE_EXTRA_CHARS``), and writes the
    table to ``fout``.
    """
    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))

    site_func = get_wrapper(get_subsequence_by_coordinates,
                            "mRNA sequence", "chimera_start", "chimera_end",
                            extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=site_func, axis=1)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def blast_file(fin: Path, fout: Path, db_title: str):
    """Run ``run_blastn`` on every site of ``fin`` and append the result columns.

    The per-row output of ``run_blastn`` is concatenated column-wise to the
    input table, and the combined table is written to ``fout``.
    """
    logger.info(f"blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)

    row_func = get_wrapper(run_blastn, "site", db_title=db_title)
    blastn_df: DataFrame = in_df.apply(func=row_func, axis=1)

    combined = pd.concat([in_df, blastn_df], axis=1)
    to_csv(combined, fout)
def duplex(method: str, fin: str, fout: str):
    """Compute duplexes for the valid rows of ``fin`` with the chosen method.

    ``method`` selects the duplex class from ``DUPLEX_DICT``. Only rows matched
    by the query ``valid_row`` are processed; their results are left-joined
    back onto the full table by index, so the other rows keep empty duplex
    columns. The method name is stored in ``duplex_method`` and the table is
    written to ``fout``.
    """
    duplex_cls: Duplex = DUPLEX_DICT[method]
    logger.info(f"{method} do_duplex to {fin}")

    in_df: DataFrame = read_csv(Path(fin))
    valid_rows = in_df.query("valid_row")
    row_func = get_wrapper(do_duplex, "miRNA sequence", "site",
                           cls=duplex_cls)
    duplex_df = valid_rows.apply(func=row_func, axis=1)

    result = pd.merge(
        left=in_df,
        right=duplex_df,
        left_index=True,
        right_index=True,
        how='left',
    )
    result["duplex_method"] = method
    to_csv(result, Path(fout))
def insert_site_from_chromosome(fin: str, fout: str, chr_dir: str):
    """Fill ``site`` with the upper-cased sequence read from chromosome files.

    For every row of ``fin``, ``extract_seq_from_chromosome`` is called with
    the columns ``chr``, ``start``, ``end`` and ``strand`` and the directory
    ``chr_dir``. The result is upper-cased and the table is written to
    ``fout``.
    """

    def to_upper(sequence):
        return sequence.upper()

    logger.info(f"Insert site from chromosome to {fin}")
    df: DataFrame = read_csv(Path(fin))

    extractor = get_wrapper(extract_seq_from_chromosome,
                            'chr', 'start', 'end', 'strand',
                            directory=Path(chr_dir))
    df["site"] = df.apply(func=extractor, axis=1)
    df["site"] = df["site"].apply(to_upper)

    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def finalize(fin: str, fout: str):
    """Produce the normalized output table from ``fin`` and write it to ``fout``.

    Steps, in order:
      1. Extract ``site`` from ``sequence`` using ``start``/``end`` (rows with
         a missing ``sequence`` are skipped and get a missing ``site``).
      2. Widen ``start``/``end`` by ``SITE_EXTRA_CHARS``; ``start`` is set to 1
         when it does not exceed ``SITE_EXTRA_CHARS``.
      3. Replace ``T`` with ``U`` in the sequence columns.
      4. Add ``seed_family`` derived from ``miRNA sequence``.
      5. Add the boolean ``valid_row`` flag.
      6. Keep only ``NORMALIZATION_COLUMNS``.

    Args:
        fin: path of the input CSV.
        fout: path of the output CSV.
    """

    def padded_start(value) -> int:
        # Shift the start left by the padding, never going below position 1.
        start = int(value)
        return start - SITE_EXTRA_CHARS if start > SITE_EXTRA_CHARS else 1

    def padded_end(value) -> int:
        return int(value) + SITE_EXTRA_CHARS

    df: DataFrame = read_csv(Path(fin))

    logger.info("extract the site")
    # The site is cut BEFORE start/end are widened below, so the extractor
    # still sees the original coordinates.
    df["site"] = df[df["sequence"].notnull()].apply(
        func=get_wrapper(get_subsequence_by_coordinates_no_exception,
                         "sequence", "start", "end",
                         extra_chars=SITE_EXTRA_CHARS),
        axis=1)

    # Only non-null coordinates are converted; the others stay missing after
    # index alignment on assignment.
    df["start"] = df[df["start"].notnull()]["start"].apply(padded_start)
    df["end"] = df[df["end"].notnull()]["end"].apply(padded_end)

    logger.info("replace T with U")
    seq_cols = ['miRNA sequence', 'site', 'sequence']
    df[seq_cols] = df[seq_cols].replace(to_replace='T', value='U', regex=True)

    logger.info("Add seed family")
    df["seed_family"] = df['miRNA sequence'].apply(extract_seed_family)

    logger.info("Add valid/invalid flag")
    # The list starts with the boolean isna masks; the reduction below folds
    # the remaining conditions into them left to right, so keep this order.
    invalid_conditions = [
        pd.isna(df["miRNA sequence"]),
        pd.isna(df["site"]),
        df["miRNA sequence"].str.contains('X'),
        df["miRNA sequence"].str.contains('N'),
        df["site"].str.contains("N"),
        df["site"].str.contains("Error"),
        df["sequence"].str.contains('N'),
        df["sequence"].str.contains('X'),
        df["sequence"].str.contains("Error"),
        df["sequence"].str.contains("None")
    ]
    df["valid_row"] = ~reduce((lambda x, y: x | y), invalid_conditions)

    df = df[NORMALIZATION_COLUMNS]
    to_csv(df, Path(fout))
def qclash_melanoma_mirna_seq_insertion(fname: str):
    """Fill ``miRNA sequence`` for a qCLASH melanoma file.

    Reads ``READ_PATH / fname``, loads the ``hsa`` entries of the miRBase
    table, applies ``qclash_mirna_func`` to ``miRNA ID`` and ``mirna_seq_tmp``
    of every row, and writes the table to ``MIRNA_SEQ_PATH / fname``.
    """
    logger.info(f"Insert mirna sequence to {fname}")
    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname

    df: DataFrame = read_csv(fin_full_path)
    mirbase_columns = ["miRNA ID", "miRNA sequence", "prefix"]
    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE, usecols=mirbase_columns)
    hsa = mirbase_df.query("prefix == 'hsa'")

    row_func = get_wrapper(qclash_mirna_func, 'miRNA ID', 'mirna_seq_tmp',
                           mirbase_hsa=hsa)
    df["miRNA sequence"] = df.apply(func=row_func, axis=1,
                                    result_type="expand")

    to_csv(df, fout_full_path)
    logger.info(f"Finish the mirna sequence insertion to {fname}")
def run(out_filename: str):
    """Split the chimeric reads into miRNA and site parts and save the table.

    ``chimera_split`` is applied to the read sequence and its 5'/3'
    coordinates. The two resulting columns get ``T`` replaced by ``U``, then
    the columns are renamed, metadata is added, and the table is saved under
    ``out_filename``.
    """
    df = read()

    split_columns = ["mirna_seq_tmp", "site"]
    splitter = get_wrapper(
        chimera_split,
        'ReadSequences',
        'Read_start_5',
        'Read_end_5',
        'Read_start_3',
        'Read_end_3',
    )
    df[split_columns] = df.apply(func=splitter, axis=1, result_type="expand")
    df[split_columns] = df[split_columns].replace(to_replace='T', value='U',
                                                  regex=True)

    df = change_columns_names(df)
    df = add_meta_data(df)
    save(df, out_filename)
def run_blastn(seq: str, db_title: str) -> Series:
    """Blast ``seq`` against ``db_title`` and return the selected hit.

    The query is written to a temporary FASTA file and ``blastn`` is run
    (task ``blastn-short``, plus strand, tabular output) from
    ``BIOMART_BLAST_PATH``. Hits are kept only when they meet
    ``MINIMAL_BLAST_IDENTITY`` and ``MINIMAL_BLAST_COVERAGE`` and have no gap
    opens. The surviving hits are joined with
    ``BIOMART_DATA_PATH / "<db_title>.csv"`` and the one with the largest
    ``sequence length`` is returned.

    Args:
        seq: query sequence.
        db_title: name of the blast database and of the matching CSV file.

    Returns:
        A Series indexed by Gene_ID, sequence, identity, coverage, s.start and
        s.end. It is empty (index only) when ``seq`` has no length, is shorter
        than ``MINIMAL_LENGTH_TO_BLAST``, or no hit passes the filters.
    """
    result_fields = [
        "Gene_ID", "sequence", "identity", "coverage", "s.start", "s.end"
    ]

    def blast_coverage(start: int, end: int, query: str) -> float:
        # Percentage of the query length covered by the subject interval.
        return (end - start + 1.0) * 100 / len(query)

    def no_hit() -> Series:
        return pd.Series(index=result_fields)

    # A value without len() (e.g. a missing site) is treated like a too-short
    # sequence.
    try:
        too_short = len(seq) < MINIMAL_LENGTH_TO_BLAST
    except TypeError:
        return no_hit()
    if too_short:
        return no_hit()

    # Column names of the tabular blast output (outfmt=6 below).
    colnames = [
        'query acc.ver', 'subject acc.ver', '%identity', 'alignment length',
        'mismatches', 'gap opens', 'q.start', 'q.end', 's.start', 's.end',
        'evalue', 'bit score'
    ]

    with NamedTemporaryFile(prefix="blast") as blast_out_file:
        with NamedTemporaryFile(prefix="blast") as seq_to_find_file:
            record = SeqRecord(Seq(seq), description="seq_to_find")
            SeqIO.write(record, seq_to_find_file.name, "fasta")

            cline = NcbiblastnCommandline(query=str(seq_to_find_file.name),
                                          db=db_title,
                                          evalue=1,
                                          strand="plus",
                                          task="blastn-short",
                                          out=str(blast_out_file.name),
                                          outfmt=6)
            call_wrapper(cmd=str(cline), cwd=BIOMART_BLAST_PATH)

        # Parse the output file while the temporary file still exists.
        result = pd.read_csv(blast_out_file.name, sep='\t', names=colnames,
                             header=None)

    result = result.rename(columns={
        '%identity': "identity",
        'alignment length': 'alignment_length',
        "gap opens": "gap_opens"
    })

    coverage_func = get_wrapper(blast_coverage, 's.start', 's.end', query=seq)
    try:
        result["coverage"] = result.apply(func=coverage_func, axis=1)
    except ValueError:
        # Empty result dataframe
        assert result.shape[0] == 0, "Wrong exception. have to check"
        return no_hit()

    # Consider the full match rows only
    result = result.query(
        "identity >= @MINIMAL_BLAST_IDENTITY and "
        "coverage >= @MINIMAL_BLAST_COVERAGE and "
        "gap_opens == 0")
    result = result.reset_index()

    if result.shape[0] == 0:
        return no_hit()

    # Attach the full sequence of each matched transcript.
    transcripts: DataFrame = pd.read_csv(BIOMART_DATA_PATH / f"{db_title}.csv")
    result = result.merge(transcripts, how="left",
                          left_on="subject acc.ver", right_on="ID")

    # Keep the hit whose transcript has the largest "sequence length".
    best = result.iloc[result["sequence length"].idxmax()]
    best = best.rename({'ID': "Gene_ID"})
    return best[result_fields]