Example #1
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContex` and load into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in `suffix`.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix) for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run(statement)
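The header string above maps each input file to a database track name by stripping a known suffix. A minimal plain-Python sketch of that derivation, assuming P.snip behaves as a simple suffix strip (the file names here are hypothetical):

import os

def snip(filename, suffix):
    """Stand-in for P.snip: remove a known suffix from a filename."""
    assert filename.endswith(suffix)
    return filename[:-len(suffix)]

# hypothetical inputs mirroring the call above
infiles = ["ctx/liver.contextstats.tsv.gz", "ctx/brain.contextstats.tsv.gz"]
suffix = ".contextstats.tsv.gz"

header = ",".join(snip(os.path.basename(x), suffix) for x in infiles)
print(header)  # liver,brain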
Example #2
def loadRepeats(infile, outfile):
    """load genomic locations of repeats into database.

    This method loads the genomic coordinates (contig, start, end)
    and the repeat name into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`gff` with repeat annotations.
    outfile : string
        Output filename with logging information. The table name is
        derived from outfile.

    """
    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=class "
        "--header-names=contig,start,stop,class")

    statement = """zcat %(infile)s
    | cgat gff2bed --set-name=class
    | grep -v "#"
    | cut -f1,2,3,4
    | %(load_statement)s
    > %(outfile)s"""
    P.run(statement, job_memory=PARAMS["job_memory"])
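These loaders are normally wired into a pipeline as ruffus tasks, which is how CGAT pipelines are built. A hedged sketch of how loadRepeats might be attached to a task; the input file name and suffix pattern are assumptions:

from ruffus import transform, suffix

@transform("repeats.gff.gz",        # hypothetical annotation file
           suffix(".gff.gz"),
           ".load")
def runLoadRepeats(infile, outfile):
    # delegate to the loader defined above
    loadRepeats(infile, outfile)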
Example #3
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.

    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        iotools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(tablename=tablename,
                                            options="--allow-empty-file "
                                            "--add-index=category "
                                            "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
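When the GO output directory is absent, the task must still create outfile so that downstream tasks see it as up to date; iotools.touch_file writes that empty sentinel. A plain-Python sketch of the same guard, with hypothetical paths:

import os
from pathlib import Path

indir = "go_results.tsv.dir"   # hypothetical GO output directory
outfile = "go_results.load"    # hypothetical sentinel/log file

if not os.path.exists(indir):
    # equivalent of iotools.touch_file: create an empty sentinel
    Path(outfile).touch()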
Example #4
def loadGeneInformation(infile,
                        outfile,
                        only_proteincoding=False,
                        job_memory="4G"):
    '''load gene-related attributes from :term:`gtf` file into database.

    This method takes transcript-associated features from an
    :term:`gtf` file and collects the gene-related attributes in the
    9th column of the gtf file, ignoring exon_id, transcript_id,
    transcript_name, protein_id and exon_number.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Output filename, contains logging information. The
       table name is derived from the filename of outfile.
    only_proteincoding : bool
       If True, only consider protein coding genes.
    '''

    table = P.to_table(outfile)

    if only_proteincoding:
        filter_cmd = """cgat gtf2gtf
        --method=filter --filter-method=proteincoding""" % PARAMS
    else:
        filter_cmd = "cat"

    load_statement = P.build_load_statement(table,
                                            options="--add-index=gene_id "
                                            "--add-index=gene_name "
                                            "--map=gene_name:str")

    statement = '''
    zcat %(infile)s
    | %(filter_cmd)s
    | grep "transcript_id"
    | cgat gtf2gtf
    --method=sort --sort-order=gene+transcript
    | cgat gtf2tsv
    --attributes-as-columns --output-only-attributes -v 0
    | cgat csv-cut
    --remove exon_id transcript_id transcript_name protein_id exon_number
    | (read h; echo "$h"; sort )
    | uniq
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement, job_memory=job_memory)
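The `(read h; echo "$h"; sort) | uniq` fragment in the statement sorts and deduplicates the table body while keeping the header line first. A rough Python rendering of that idiom, reading a hypothetical TSV file:

def sort_body_keep_header(lines):
    """Keep the first line as header; sort and deduplicate the rest."""
    header, body = lines[0], lines[1:]
    return [header] + sorted(set(body))

with open("gene_info.tsv") as f:      # hypothetical gtf2tsv output
    deduped = sort_body_keep_header(f.readlines())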
Example #5
def loadMotifSequenceComposition(infile, outfile):
    '''compute sequence composition of sequences used for ab-initio
    search and load the results into the database.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile))

    statement = '''
    cgat fasta2table
        --section=na
        --log=%(outfile)s
    < %(infile)s
    | %(load_statement)s
    > %(outfile)s'''

    P.run(statement)
Example #6
def loadFimo(infile, outfile):
    '''load FIMO results into the database.'''

    to_cluster = True

    tablename = os.path.basename(outfile).replace(".load",
                                                  "").replace(".", "_")
    options = '-H "pattern_name,sequence_name,start,stop,strand,score,p_value,q_value,matched_sequence" '

    statement = []
    statement.append(f'''cat {infile} | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(f''' > {outfile}''')
    statement = ' '.join(statement)

    P.run(statement)
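The table name here is derived directly from the output file name: drop the `.load` suffix and replace remaining dots, which are not legal in SQL table names. For example, with a hypothetical path:

import os

outfile = "fimo.dir/all_motifs.v2.load"   # hypothetical
tablename = os.path.basename(outfile).replace(".load", "").replace(".", "_")
print(tablename)  # all_motifs_v2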
Example #7
def loadPolyphen(infile, outfile):
    '''load polyphen results.'''

    load_statement = P.build_load_statement(
        P.to_table(outfile),
        options="--add-index=snp_id "
        "--add-index=protein_id "
        "--map=effect:str")

    statement = '''
    gunzip
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
Example #8
def loadDremeTomTom(infile, outfile):
    '''load dreme tomtom results'''

    to_cluster = True

    tablename = os.path.basename(outfile).replace(".load",
                                                  "").replace(".", "_")
    options = '-H "query_id,target_id,optimal_offset,p_value,e_value,q_value,overlap,query_consensus,targe_consensus,orientation" '

    statement = []
    statement.append('''cat %(infile)s | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(''' > %(outfile)s''')
    statement = ' '.join(statement)

    P.run(statement)
Example #9
def loadPeaks(infile, outfile):
    '''Load input peaks to merge w/ MAST results by peak_id'''

    tablename = os.path.basename(outfile).replace(".load",
                                                  "").replace(".", "_")
    options = '-H "contig,start,end,peak_id,score" '

    statement = []
    statement.append(f'''cat {infile} | cut -f1-5 - |''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(f''' > {outfile}''')
    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)
Example #10
def loadMemeTomTom(infile, outfile):
    '''load meme tomtom results'''

    # cgat default is not to submit P.load() jobs to cluster
    # this is hardcoded in P.load()

    tablename = os.path.basename(outfile).replace(".load",
                                                  "").replace(".", "_")
    options = '-H "query_id,target_id,optimal_offset,p_value,e_value,q_value,overlap,query_consensus,targe_consensus,orientation" '

    statement = []
    statement.append('''cat %(infile)s | ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement.append(''' > %(outfile)s''')
    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)
Example #11
def loadMast(infile, outfile):
    '''load MAST results. Section I contains high-scoring sequences
       (for input motifs), ranked by increasing E-value (up to a maximum of 10).'''

    tmp_dir = "$SCRATCH_DIR"
    tablename = "MAST_" + outfile.split("/")[-2].replace(".", "_")
    options = '-H "peak_id,e_value,length" '

    statement = []
    statement.append(f'''tmp=`mktemp -p {tmp_dir}` &&
                         sed -n '/^SECTION I:/,/^SECTION II:/p' {infile} | 
                           grep "^[a-zA-Z0-9].*[0-9]$" - | 
                           tr -s "[[:blank:]]" "\\t" 
                           > $tmp &&
                         cat $tmp ''')
    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))
    statement = ' | '.join(statement) + f''' > {outfile} && rm $tmp'''

    P.run(statement)
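The sed/grep pipeline above extracts Section I of the plain-text MAST report and squeezes runs of blanks into tabs. A rough pure-Python equivalent of that extraction, assuming the report uses the `SECTION I:`/`SECTION II:` markers shown:

import re

def mast_section1(path):
    """Yield tab-separated data rows from SECTION I of a MAST report."""
    in_section = False
    with open(path) as f:
        for line in f:
            if line.startswith("SECTION I:"):
                in_section = True
            elif line.startswith("SECTION II:"):
                break
            elif in_section and re.match(r"^[a-zA-Z0-9].*[0-9]$", line.rstrip()):
                # squeeze blanks to single tabs, as tr -s does
                yield re.sub(r"[ \t]+", "\t", line.rstrip())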
Example #12
def loadmiRNATranscripts(infile, outfile):
    '''load transcripts from a GFF3 file into the database.

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gff3` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--allow-empty-file "
                                            "--header-names=feature,Name")

    statement = '''
     export LANG=en_GB.UTF-8 && zcat %(infile)s
    | cgat gtf2tsv --is-gff3 --attributes-as-columns 2> /dev/null
    | grep -v "#"
    | cut -f3,12
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_memory"])
Example #13
def getMotifIDs(infile, outfile):
    '''Create motif_ID:TF lookup table'''

    tmp_dir = "$SCRATCH_DIR"

    tablename = os.path.basename(infile).replace(".txt", "").replace(".", "_")
    filename = outfile.replace(".load", ".txt")
    options = '-H "pattern_name,TF" '

    # use sed/tr to remove problematic chars e.g. ".", "(", "/" from motif names
    statement = []
    statement.append(f'''tmp=`mktemp -p {tmp_dir}` &&
                          grep "^MOTIF" {infile} |
                            sed 's/MOTIF //' |
                            tr -d '()./\\:_' |
                            tr -s "[[:blank:]]" "\\t"
                            > $tmp &&
                            cat $tmp | ''')

    statement.append(
        P.build_load_statement(tablename, options=options, retry=True))

    statement.append(f''' > {outfile} &&
                         mv $tmp {filename}''')

    statement = ' '.join(statement)

    to_cluster = True

    P.run(statement)
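For reference, the character stripping done by the sed/tr chain can be expressed in a single regular expression. A hedged Python equivalent (the example motif name is hypothetical):

import re

def clean_motif_id(motif_id):
    """Strip characters that are awkward in table keys: . ( ) / \\ : _"""
    return re.sub(r"[().:_/\\]", "", motif_id)

print(clean_motif_id("MA0139.1(CTCF)"))  # MA01391CTCF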
Example #14
def loadBigWigStats(infiles, outfile):
    '''merge and load bigwig summary for all wiggle files.

    Summarise and merge bigwig files for all samples and load into a
    table called bigwig_stats

    Parameters
    ----------
    infiles : list
       Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''

    data = " ".join([
        '<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' % x
        for x in infiles
    ])
    headers = ",".join([P.snip(os.path.basename(x), ".bw") for x in infiles])

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=track")

    statement = '''cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run(statement)
Example #15
def loadTranscripts(infile, outfile):
    '''load transcripts from a GTF file into the database.

    The table will be indexed on ``gene_id`` and ``transcript_id``

    Arguments
    ---------
    infile : string
       ENSEMBL geneset in :term:`gtf` format.
    outfile : string
       Logfile. The table name is derived from `outfile`.

    '''
    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=gene_id "
                                            "--add-index=transcript_id "
                                            "--allow-empty-file ")

    statement = '''
    gunzip < %(infile)s
    | cgat gtf2tsv -f
    | %(load_statement)s
    > %(outfile)s'''
    P.run(statement, job_memory=PARAMS["job_highmemory"])
Example #16
def loadFimo(infile, outfile):
    '''load fimo results from tsv'''

    fimo_result = infile.replace(".fimo.log", "/fimo.txt")

    # escape clause if fimo fails to find motifs
    if len(pd.read_csv(fimo_result, sep="\t")) == 0:
        statement = f'''echo "file {fimo_result} empty" '''

    else:
        tablename = '_'.join([outfile.split("/")[1],
                              "fimo_results"]).replace(".", "_")
        opts = '-H "pattern_name,sequence_name,start,stop,strand,score,p_value,q_value,matched_sequence" '

        statement = []
        statement.append(f'''grep -v "#" {fimo_result} | ''')
        statement.append(
            P.build_load_statement(tablename, options=opts, retry=True))
        statement.append(f''' > {outfile}''')
        statement = ' '.join(statement)

        to_cluster = True

    P.run(statement, job_memory="4G")
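One caveat with the guard above: pd.read_csv raises EmptyDataError on a file with no content at all, so the zero-row check only covers header-only output. A more defensive, hedged version of the check:

import pandas as pd

def fimo_has_hits(fimo_result):
    """True if the FIMO tsv contains at least one data row."""
    try:
        return len(pd.read_csv(fimo_result, sep="\t")) > 0
    except pd.errors.EmptyDataError:
        # completely empty file, not even a header line
        return False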
Example #17
def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : list
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.to_table(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""

    to_cluster = False

    P.run(statement)

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """

        to_cluster = False

        P.run(statement)

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """

        to_cluster = False

        P.run(statement)
Example #18
def loadPicardHistogram(infiles, outfile, suffix, column,
                        pipeline_suffix=".picard_stats", tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join([P.snip(os.path.basename(x), pipeline_suffix)
                       for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --header-names=%s,%s"
        " --allow-empty-file"
        " --replace-header" % (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    to_cluster = False

    P.run(statement)
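Picard metric files append their histogram below a `## HISTOGRAM` marker, which is what `--regex-start` anchors on. A short Python sketch of locating that block in a single metrics file:

def picard_histogram_lines(path):
    """Return the lines following the '## HISTOGRAM' marker, if any."""
    with open(path) as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if line.startswith("## HISTOGRAM"):
            return lines[i + 1:]
    return []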
Example #19
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.to_table(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(
            tablename=tablename)

        statement = '''
        cgat combine_tables
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run(statement)

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
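Database.executewait wraps a DB-API cursor with retries on a locked database. Under the usual CGAT setup the backend is SQLite, so the distinct-ontology query above is equivalent to roughly this sketch (the database path and table name are hypothetical):

import sqlite3

dbh = sqlite3.connect("csvdb")   # hypothetical database file
ontologies = [
    row[0] for row in dbh.execute(
        "SELECT DISTINCT ontology FROM hypergeometric_genes_summary")
]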