def main():
    """Build the sample table and write it out as a TSV file.

    The table returned by ``download_geo_metadata`` is passed, in order,
    through ``update_fbgns``, ``update_gene_symbols`` and ``reorder_columns``.
    The result is written tab-separated, without the index, to
    ``timecourse-aln-wf/config/sampletable.tsv`` under ``PROJECT_DIR``.
    """
    table = download_geo_metadata()
    for step in (update_fbgns, update_gene_symbols, reorder_columns):
        table = table.pipe(step)

    out_file = PROJECT_DIR / "timecourse-aln-wf/config/sampletable.tsv"
    logger.info(f"Writing out sample table: {out_file}")
    table.to_csv(out_file, sep="\t", index=False)
def download_geo_metadata() -> pd.DataFrame:
    """Query GEO for the series ``GSE`` and build a per-run sample table.

    Each sample (GSM) title is parsed into ``plate_id``, ``time_point``,
    ``plate_row``, ``plate_column``, ``drsc``, ``fbgn`` and ``symbol``. Every
    ``relation`` entry of the form ``<key>: ...<accession>`` adds
    ``attrs[key] = accession``; the ``SRA`` entry is then expanded with
    ``get_srrs`` into one row per run.

    Samples whose title does not match the expected pattern are skipped and
    reported through the logger.

    Returns:
        A data frame with one row per run, indexed by ``samplename`` and
        ``SRX`` (the ``SRA`` column renamed).

    Raises:
        KeyError: If a sample has no ``SRA`` relation.
    """
    logger.info("Querying GEO for {}".format(GSE))

    # Compile once; both patterns are applied to every sample.
    title_pattern = re.compile(
        r"^DRSC_(?P<plate_id>plate\d)_(?P<time_point>day\d)"
        r"_(?P<plate_row>[A-H])(?P<plate_column>\d+)"
        r"_(?P<drsc>DRSC(\d+|NA))_(?P<fbgn>(FBgn\d+|NA))_(?P<symbol>.*?)$"
    )
    relation_pattern = re.compile(r"(\w+):.*[\/=](\w+\d+)$")

    # The context manager guarantees the download directory is removed, even
    # if parsing or a later lookup raises.
    with TemporaryDirectory() as tmp_dir:
        gse = GEOparse.get_GEO(GSE, destdir=tmp_dir, silent=True)

        # Pull out sample attributes and build data frame
        attributes = []
        for gsm, dat in gse.gsms.items():
            # Parse sample title
            title_match = title_pattern.match(dat.metadata["title"][0])
            if title_match is None:
                logger.warning(
                    "Skipping {}: unrecognized sample title {}".format(
                        gsm, dat.metadata["title"]
                    )
                )
                continue

            attrs = title_match.groupdict()
            attrs["GEO"] = gsm

            # Get SRX accession from linkout
            for relation in dat.metadata["relation"]:
                link = relation_pattern.match(relation)
                if link:
                    key, value = link.groups()
                    attrs[key] = value

            # Expand out SRR accessions. Each run gets its own dict: appending
            # the shared ``attrs`` object would make every row of a multi-run
            # sample end up with the last SRR.
            for srr in get_srrs(attrs["SRA"]):
                attributes.append({**attrs, "samplename": srr, "Run": srr})

    return (
        pd.DataFrame(attributes)
        .rename(columns={"SRA": "SRX"})
        .set_index(["samplename", "SRX"])
    )
def get_current_flybase_annotations():
    """Load the FlyBase annotation table and build FBgn/symbol lookup maps.

    Reads the YAML file at ``CONFIG`` to get ``assembly`` and
    ``aligner.tag``, locates the annotation file under
    ``$REFERENCES_DIR/<assembly>/<tag>/fb_annotation/`` and builds two dicts:

    * ``fbgns``: maps every primary FBgn to itself and every secondary FBgn
      (comma-separated in ``secondary_FBgn``) to its primary FBgn.
    * ``genes``: maps each primary FBgn to its gene symbol.

    Raises:
        KeyError: If ``REFERENCES_DIR`` is not set in the environment or a
            required config key is missing.
    """
    # Import config set up references
    logger.info("Loading config: {}".format(CONFIG))
    with open(CONFIG) as fh:
        # safe_load: ``yaml.load`` without an explicit Loader is rejected by
        # PyYAML >= 6 and only plain config data is expected here.
        config = yaml.safe_load(fh)

    assembly = config["assembly"]
    tag = config["aligner"]["tag"]
    ref_dir = os.path.join(os.environ["REFERENCES_DIR"], assembly, tag)

    # load flybase annotations
    fb_anno = os.path.join(
        ref_dir, "fb_annotation/dmel_{}.fb_annotation".format(tag)
    )
    logger.info("Loading FlyBase annotation file: {}".format(fb_anno))
    fb = pd.read_table(fb_anno)[
        ["primary_FBgn", "gene_symbol", "secondary_FBgn"]
    ].rename(columns={"primary_FBgn": "FBgn", "gene_symbol": "symbol"})

    # Make map of old fbgn to current fbgn and current fbgn to current symbol
    fbgns = {}
    genes = {}
    for fbgn, symbol, secondary in zip(
        fb["FBgn"], fb["symbol"], fb["secondary_FBgn"]
    ):
        fbgns[fbgn] = fbgn
        genes[fbgn] = symbol

        # Missing secondary IDs are read as NaN (a float), so only split
        # real strings.
        if isinstance(secondary, str):
            for old_fbgn in secondary.strip().split(","):
                fbgns[old_fbgn] = fbgn

    # NOTE(review): no return statement is visible for this function in the
    # reviewed chunk, so `fbgns` and `genes` are built but not returned here.
    # Confirm against the full file.
from s2rnai.logger import logger def srr_iter(srx): """Create generator to return SRR given an SRX.""" res = Entrez.efetch(db='sra', id=srx) xml = res.read() root = ElementTree.fromstring(xml) for run in root.iter('RUN'): yield run.get('accession') res.close() # Import config set up references logger.info('Loading config: {}'.format(CONFIG)) with open(CONFIG) as fh: config = yaml.load(fh) assembly = config['assembly'] tag = config['aligner']['tag'] REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag) # load flybase annotations FB_ANNO = os.path.join(REF, 'fb_annotation/dmel_{}.fb_annotation'.format(tag)) logger.info('Loading FlyBase annotation file: {}'.format(FB_ANNO)) fb = pd.read_table(FB_ANNO)[['primary_FBgn', 'gene_symbol', 'secondary_FBgn']] fb.rename( columns={ 'primary_FBgn': 'FBgn', 'gene_symbol': 'symbol'
if __name__ == '__main__':
    # Script entry point: download the FlyBase GFF named in the config (if it
    # is not already on disk) and extract the DRSC RNAi reagent records into a
    # separate, uncompressed GFF next to it.
    import shlex

    # Import config and set up references
    with open('../config/config.yml') as fh:
        # safe_load: ``yaml.load`` without an explicit Loader is rejected by
        # PyYAML >= 6 and only plain config data is expected here.
        config = yaml.safe_load(fh)

    assembly = config['assembly']
    tag = config['aligner']['tag']
    REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)

    # Download gff file from FlyBase. The GFF URL is derived from the GTF URL
    # by replacing every 'gtf' with 'gff'.
    url = config['references'][config['assembly']][
        config['gtf']['tag']]['gtf']['url'].replace('gtf', 'gff')
    fname = os.path.join('../data/external/FlyBase', os.path.basename(url))

    if not os.path.exists(fname):
        logger.info('Createing FlyBase directory.')
        os.makedirs(os.path.dirname(fname), exist_ok=True)

        logger.info('Downloading GFF')
        # urlretrieve writes to `fname` itself. Pre-opening the file here
        # would leave an empty file behind if the download failed, and the
        # existence check above would then skip the download on later runs.
        urllib.request.urlretrieve(url, fname)

    # Output filename to store drsc gff info
    drsc_fname = fname.replace('.gff.gz', '.drsc.gff')

    # use grep to filter our DRSC, this is the fastest way
    if not os.path.exists(drsc_fname):
        logger.info('Filtering GFF')
        # Quote the paths: they are derived from the configured URL and are
        # interpolated into a shell command line.
        cmd = 'gunzip -c {fname} | grep "DRSC_dsRNA" | grep "RNAi_reagent" > {drsc_fname}'.format(
            fname=shlex.quote(fname), drsc_fname=shlex.quote(drsc_fname))
        # The exit status is deliberately not checked: grep exits non-zero
        # when no line matches.
        subprocess.run(cmd, shell=True)