def main():
    """Download every archaeal/bacterial genome and cache its annotated IGRs.

    Queries the Rfam database for all ``Genome`` rows whose kingdom is
    'archaea' or 'bacteria' and whose ``assembly_acc`` is non-empty,
    downloads each genome, and then, for every genome that does not already
    have a pickle on disk, extracts its inter-genic regions, annotates them
    and writes the annotated dataframe to
    ``/home/jovyan/work/data/interim/igr_df_pickles/<assembly_acc>.bz2``.
    """
    logger = logging.getLogger(__name__)
    logger.info('downloading genomes from NCBI')

    session = rfam_session()
    try:
        # Materialize the result while the session is still open. A Query is
        # lazy, so iterating it after close() (and iterating it twice) would
        # re-run the SELECT each time on a connection that is never released.
        genomes = session.query(Genome).filter(
            or_(Genome.kingdom == 'archaea', Genome.kingdom == 'bacteria')
        ).filter(Genome.assembly_acc != '').all()
    finally:
        session.close()

    # Phase 1: fetch every genome before any processing starts.
    for genome in genomes:
        download_genome(genome)

    # Phase 2: build the annotated IGR dataframe for genomes not yet cached.
    for genome in genomes:
        pickle_filename = "/home/jovyan/work/data/interim/igr_df_pickles/" + genome.assembly_acc + ".bz2"
        if os.path.isfile(pickle_filename):
            # Already processed on a previous run; skip the expensive work.
            continue
        igr_df = extract_igrs(genome)
        annotated_igr_df = annotate_igrs(genome, igr_df)
        annotated_igr_df.to_pickle(pickle_filename)
def display_genome(upid):
    """Build the annotated IGR dataframe and an interactive figure for one genome.

    Parameters
    ----------
    upid:
        Primary-key value used to look up the ``Genome`` row.
        NOTE(review): ``Query.get`` returns ``None`` for an unknown key, which
        is then passed straight to ``download_genome`` - confirm callers only
        pass valid identifiers.

    Returns
    -------
    tuple
        ``(annotated_df, fig, layout, genome)`` where ``annotated_df`` is the
        result of ``annotate_igrs``, ``layout`` is the result of
        ``graph_layout(genome)``, ``fig`` is a ``go.FigureWidget`` built from
        the ``graph_genome`` traces and that layout, and ``genome`` is the
        looked-up ``Genome`` object.
    """
    session = rfam_session()
    try:
        genome = session.query(Genome).get(upid)
    finally:
        # Release the session even if the lookup raises.
        session.close()

    download_genome(genome)
    igr_df = extract_igrs(genome, igr_length_cutoff=1)
    annotated_df = annotate_igrs(genome, igr_df)

    scatter_plots = graph_genome(annotated_df)
    layout = graph_layout(genome)
    fig = go.FigureWidget(data=scatter_plots, layout=layout)
    return annotated_df, fig, layout, genome
def annotate_igrs(genome, igr_df):
    """
    Annotate the inter-genic regions listed in a dataframe with any available annotations from Rfam

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.DataFrame
        The dataframe with the columns 'accession', 'start', 'end', 'length', 'gc'.
        NOTE(review): the code also merges on 'igr_index' and records each
        row's index label under that name, so igr_df presumably exposes its
        index as 'igr_index' - confirm against extract_igrs.

    Returns
    -------
    annotated_igr_df: pandas.DataFrame
        One row per 'igr_index'. In addition to the merged input columns it
        carries 'rfam_acc', 'rfam_id', 'type' (comma-joined) and 'description'
        ('<br>'-joined) for the overlapping Rfam families, a 'category' column
        produced by ``categorize_igr`` and 'log_length', the natural log of
        'length'.
    """
    # Initialize connection to Rfam database
    session = rfam_session()
    try:
        # Get the list of "rfamseq_acc" numbers for the given organism
        rfamseq_acc_rows = session.query(t_genseq.c.rfamseq_acc).filter(
            t_genseq.c.upid == genome.upid).distinct().all()

        # Map each accession (version suffix stripped) to the interval tree of its RNA annotations
        annotation_tree_dict = {}
        for rfamseq_acc_row in rfamseq_acc_rows:
            # Each result row is a 1-tuple; pull the accession out of it
            rfamseq_acc = rfamseq_acc_row[0]
            rna_list = session.query(t_full_region).filter(
                t_full_region.c.rfamseq_acc == rfamseq_acc).all()

            # An interval tree allows rapid overlap searches against all RNA annotations
            annotation_tree = IntervalTree()
            for rna in rna_list:
                # seq_start may be greater than seq_end, so normalize the
                # coordinates to an ascending interval before inserting
                start = min(rna.seq_start, rna.seq_end)
                end = max(rna.seq_start, rna.seq_end)
                annotation_interval = Interval(start=start, end=end, chrom=rna.rfamseq_acc, value=rna)
                annotation_tree.insert_interval(annotation_interval)

            # Key by everything before the first '.' so lookups use the same
            # form as the 'accession' column of igr_df
            rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
            annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

        # Collect one record per (IGR, overlapping RNA annotation) pair
        annotated_igr_list = []
        for accession, accession_igr_df in igr_df.groupby('accession'):
            # Lookup the RNA annotation tree for the given accession
            try:
                annotation_tree = annotation_tree_dict[accession]
            except KeyError:
                print("IGR dataframe key: {} not found. Available keys are: {}".format(
                    accession, annotation_tree_dict.keys()))
                # Without a tree for this accession there is nothing to
                # overlap against. Skip it rather than fall through and match
                # these IGRs against a stale tree from a different accession.
                continue

            # For each IGR find all of the overlaps with annotated RNAs
            for igr in accession_igr_df.itertuples():
                for overlap in annotation_tree.find(igr.start, igr.end):
                    annotated_igr_list.append({
                        'igr_index': igr[0],
                        'rfam_acc': overlap.value.rfam_acc
                    })

        # Convert annotated_igr_list into a dataframe and left-merge it onto the
        # IGRs by igr_index, so IGRs without any overlap are kept (rfam_acc is NaN)
        annotated_igr_df = pd.merge(igr_df,
                                    pd.DataFrame(annotated_igr_list, columns=["igr_index", "rfam_acc"]),
                                    on="igr_index", how='left')

        # Look up the information for all of the RNA families represented in this genome
        rna_family_query = session.query(Family)\
            .with_entities(Family.rfam_acc, Family.rfam_id, Family.description, Family.type)\
            .filter(Family.rfam_acc.in_(annotated_igr_df["rfam_acc"].dropna().unique()))
        rna_families_df = pd.read_sql(rna_family_query.statement, rna_family_query.session.bind)

        merged_igr_df = pd.merge(annotated_igr_df, rna_families_df, on="rfam_acc", how="left")

        # An IGR overlapping several families appears on several rows; collapse
        # the distinct family fields of each IGR into single joined strings
        combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
            .agg(dict(rfam_acc=lambda x: ','.join(set(x)),
                      rfam_id=lambda x: ','.join(set(x)),
                      type=lambda x: ','.join(set(x)),
                      description=lambda x: '<br>'.join(set(x))))

        # Keep one row per IGR, then overwrite its family fields with the joined values.
        # NOTE(review): DataFrame.update aligns on index labels, so this relies on
        # the igr_index values matching the 0..n-1 index produced by reset_index -
        # confirm against extract_igrs.
        merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
        merged_igr_df.reset_index(inplace=True, drop=True)
        merged_igr_df.update(combined_descriptions)

        merged_igr_df["category"] = merged_igr_df.apply(categorize_igr, axis=1)
        merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
    finally:
        # Always release the database session, including on errors
        session.close()

    return merged_igr_df