Exemple #1
0
def main():
    """Assemble the sample table and write it to a tab-separated file."""
    # Apply each transformation in turn, starting from the GEO metadata.
    metadata = download_geo_metadata()
    with_fbgns = metadata.pipe(update_fbgns)
    with_symbols = with_fbgns.pipe(update_gene_symbols)
    sampletable = with_symbols.pipe(reorder_columns)

    out_file = PROJECT_DIR / "timecourse-aln-wf/config/sampletable.tsv"
    logger.info(f"Writing out sample table: {out_file}")
    sampletable.to_csv(out_file, sep="\t", index=False)
Exemple #2
0
def download_geo_metadata() -> pd.DataFrame:
    """Query GEO for the series ``GSE`` and return one row per sequencing run.

    For every sample in the series the title is parsed into its named fields
    (plate id, time point, plate row/column, DRSC id, FBgn, symbol), the
    accessions found in the sample's ``relation`` metadata are added as
    columns, and the sample is expanded into one row per accession yielded by
    ``get_srrs`` for its ``SRA`` entry.

    Samples that raise ``AttributeError`` while being processed (for example
    a title that does not match the expected pattern, so ``re.match`` returns
    ``None``) are printed and skipped.

    Returns:
        A data frame indexed by ``("samplename", "SRX")``, where ``SRX`` is
        the renamed ``SRA`` column.
    """
    logger.info("Querying GEO for {}".format(GSE))

    # Compile once; both patterns are applied to every sample.
    title_re = re.compile(
        r"^DRSC_(?P<plate_id>plate\d)_(?P<time_point>day\d)"
        r"_(?P<plate_row>[A-H])(?P<plate_column>\d+)"
        r"_(?P<drsc>DRSC(\d+|NA))_(?P<fbgn>(FBgn\d+|NA))"
        r"_(?P<symbol>.*?)$")
    relation_re = re.compile(r"(\w+):.*[\/=](\w+\d+)$")

    # The context manager guarantees the download directory is removed, even
    # if parsing fails part-way through.
    with TemporaryDirectory() as tmp_dir:
        gse = GEOparse.get_GEO(GSE, destdir=tmp_dir, silent=True)

        ## Pull out sample attributes and build data frame
        attributes = []
        for gsm, dat in gse.gsms.items():
            try:
                # Parse sample title
                attrs = title_re.match(dat.metadata["title"][0]).groupdict()
                attrs["GEO"] = gsm

                # Get SRX accession from linkout
                for relation in dat.metadata["relation"]:
                    match = relation_re.match(relation)
                    if match:
                        key, value = match.groups()
                        attrs[key] = value

                # Expand out SRR accessions. A fresh dict is built for every
                # run: appending the shared ``attrs`` dict would make all rows
                # of a multi-run sample alias one object and end up carrying
                # the last SRR only.
                for srr in get_srrs(attrs["SRA"]):
                    attributes.append({
                        **attrs,
                        "samplename": srr,
                        "Run": srr,
                    })

            except AttributeError:
                print(gsm, dat.metadata["title"])

    df = pd.DataFrame(attributes).rename(columns={
        "SRA": "SRX"
    }).set_index(["samplename", "SRX"])
    return df
Exemple #3
0
def get_current_flybase_annotations():
    """Load the FlyBase annotation table and build FBgn lookup maps.

    The YAML config at ``CONFIG`` supplies ``assembly`` and
    ``aligner.tag``; together with the ``REFERENCES_DIR`` environment
    variable they locate the ``fb_annotation`` file, which is read with
    ``pd.read_table``.

    Two dictionaries are built from it:

    * ``fbgns`` maps every primary FBgn to itself and every secondary FBgn
      (comma-separated in ``secondary_FBgn``) to its primary FBgn.
    * ``genes`` maps every primary FBgn to its gene symbol.

    Raises:
        KeyError: if ``REFERENCES_DIR`` is not set or a config key is missing.
    """
    # Import config set up references
    logger.info("Loading config: {}".format(CONFIG))
    with open(CONFIG) as fh:
        # safe_load instead of yaml.load(fh): calling yaml.load without a
        # Loader warns on PyYAML 5.1+ and raises TypeError on PyYAML 6, and
        # a config file only needs plain YAML types.
        config = yaml.safe_load(fh)

    assembly = config["assembly"]
    tag = config["aligner"]["tag"]
    REF = os.path.join(os.environ["REFERENCES_DIR"], assembly, tag)

    # load flybase annotations
    FB_ANNO = os.path.join(REF,
                           "fb_annotation/dmel_{}.fb_annotation".format(tag))
    logger.info("Loading FlyBase annotation file: {}".format(FB_ANNO))
    fb = pd.read_table(FB_ANNO)[[
        "primary_FBgn", "gene_symbol", "secondary_FBgn"
    ]]
    fb.rename(columns={
        "primary_FBgn": "FBgn",
        "gene_symbol": "symbol"
    },
              inplace=True)

    ## Make map of old fbgn to current fbgn and current fbgn to current symbol
    fbgns = {}
    genes = {}
    # Walk the three columns in lockstep rather than building a Series per
    # row with iterrows().
    for fbgn, symbol, fbgn2 in zip(fb["FBgn"], fb["symbol"],
                                   fb["secondary_FBgn"]):
        fbgns[fbgn] = fbgn
        genes[fbgn] = symbol

        # Missing secondary ids are read as NaN (a float), so only strings
        # are split.
        if isinstance(fbgn2, str):
            for f2 in fbgn2.strip().split(","):
                fbgns[f2] = fbgn
    # NOTE(review): the visible code ends here without returning
    # ``fbgns``/``genes`` -- confirm whether a return statement was lost.
Exemple #4
0
from s2rnai.logger import logger

def srr_iter(srx):
    """Create generator to return SRR given an SRX.

    Fetches the ``sra`` record for ``srx`` through ``Entrez.efetch``, parses
    the response as XML and yields the ``accession`` attribute of every
    ``RUN`` element.

    Args:
        srx: identifier passed to ``Entrez.efetch`` as ``id``.

    Yields:
        The ``accession`` attribute of each ``RUN`` element (``None`` when an
        element has no such attribute).
    """
    res = Entrez.efetch(db='sra', id=srx)
    try:
        xml = res.read()
    finally:
        # Close as soon as the payload is read. Closing after the yield loop
        # leaked the handle whenever the consumer stopped iterating early or
        # read() raised.
        res.close()

    root = ElementTree.fromstring(xml)
    for run in root.iter('RUN'):
        yield run.get('accession')


# Import config set up references
logger.info('Loading config: {}'.format(CONFIG))
with open(CONFIG) as fh:
    # safe_load instead of yaml.load(fh): calling yaml.load without a Loader
    # warns on PyYAML 5.1+ and raises TypeError on PyYAML 6, and a config
    # file only needs plain YAML types.
    config = yaml.safe_load(fh)

assembly = config['assembly']
tag = config['aligner']['tag']
# Reference files live under $REFERENCES_DIR/<assembly>/<aligner tag>.
REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)

# load flybase annotations
FB_ANNO = os.path.join(REF, 'fb_annotation/dmel_{}.fb_annotation'.format(tag))
logger.info('Loading FlyBase annotation file: {}'.format(FB_ANNO))
# Keep only the columns needed for the FBgn / symbol lookups.
fb = pd.read_table(FB_ANNO)[['primary_FBgn', 'gene_symbol', 'secondary_FBgn']]
fb.rename(
    columns={
       'primary_FBgn': 'FBgn',
       'gene_symbol': 'symbol'
Exemple #5
0
if __name__ == '__main__':
    # Local import: only this script entry point needs shell quoting.
    import shlex

    # Import config and set up references
    with open('../config/config.yml') as fh:
        # safe_load instead of yaml.load(fh): calling yaml.load without a
        # Loader warns on PyYAML 5.1+ and raises TypeError on PyYAML 6.
        config = yaml.safe_load(fh)

    assembly = config['assembly']
    tag = config['aligner']['tag']
    REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)

    # Download gff file from FlyBase
    # The GFF url is derived from the configured GTF url by swapping every
    # 'gtf' for 'gff'.
    url = config['references'][config['assembly']][
        config['gtf']['tag']]['gtf']['url'].replace('gtf', 'gff')
    fname = os.path.join('../data/external/FlyBase', os.path.basename(url))

    if not os.path.exists(fname):
        logger.info('Createing FlyBase directory.')
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        logger.info('Downloading GFF')
        # urlretrieve writes the target file itself. Pre-opening it with
        # open(fname, 'wb') created an empty file before the download began,
        # so a failed request left a stub that made later runs skip the
        # download.
        response = urllib.request.urlretrieve(url, fname)

    # Output filename to store drsc gff info
    drsc_fname = fname.replace('.gff.gz', '.drsc.gff')

    # use grep to filter our DRSC, this is the fastest way
    if not os.path.exists(drsc_fname):
        logger.info('Filtering GFF')
        # Quote both paths so spaces or shell metacharacters in a file name
        # cannot alter the pipeline.
        cmd = 'gunzip -c {fname} | grep "DRSC_dsRNA" | grep "RNAi_reagent" > {drsc_fname}'.format(
            fname=shlex.quote(fname), drsc_fname=shlex.quote(drsc_fname))
        subprocess.run(cmd, shell=True)