def collect_data(infile, genes):
    """Collect per-sample gene values and clinical values via ``cbioportal``.

    ``infile`` is parsed as a tab-delimited file whose first line is a header
    and is discarded.  Blank rows are ignored, and rows whose first field
    starts with ``#`` are skipped with a warning on stderr.  For every other
    row, ``row[1]`` and ``row[2]`` are forwarded together with ``genes`` to
    ``cbioportal.get_multi_gene`` and ``row[1]`` is forwarded to
    ``cbioportal.get_clin_data`` (presumably a case-list id and a profile
    id -- confirm against the cbioportal helper).

    Both responses are treated as lists of text lines whose first line is a
    header.  If either response has no lines after its header, an error is
    written to stderr and the row is skipped.

    Args:
        infile: Path of the tab-delimited query file.
        genes: Passed through unchanged to ``cbioportal.get_multi_gene``.

    Returns:
        A ``defaultdict(dict)``.  Its keys are the profile header tokens
        after the first two, plus the first token of every clinical line.
        Each value maps the second token of a profile line (the gene id)
        to that line's value for the column, and ``"AGE"`` to the second
        token of the matching clinical line.  All values are kept as the
        strings found in the responses.
    """
    sample_data = defaultdict(dict)

    # Python 2 needs the "U" flag for universal newlines; Python 3 enables
    # them by default and rejects the flag from 3.11 onwards.
    mode = "rU" if sys.version_info[0] < 3 else "r"

    with open(infile, mode) as casefile:
        reader = csv.reader(casefile, dialect="excel-tab")
        # Discard the header line; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv yields [] for blank lines (e.g. a trailing newline).
                continue
            if row[0].startswith("#"):
                sys.stderr.write("WARNING: Skipping commented input line\n")
                continue

            # Echo of the row being queried, kept on stdout as before.
            # Parenthesised so it parses on both Python 2 and Python 3.
            print(row)
            profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
            clin_data = cbioportal.get_clin_data(row[1])

            # An entirely empty response has no header to pop; fall back to
            # an empty header so the error branches below are still reached.
            profile_header = profile_data.pop(0) if profile_data else ""
            clin_header = clin_data.pop(0) if clin_data else ""

            if not profile_data:
                sys.stderr.write(
                    "ERROR: No profile data retrieved for query, "
                    "with response header: {}\n".format(profile_header)
                )
                continue

            if not clin_data:
                sys.stderr.write(
                    "ERROR: No clinical data retrieved for query, "
                    "with response header: {}\n".format(clin_header)
                )
                continue

            # The first two header tokens label the two leading columns of
            # each data line; the remaining tokens name the value columns.
            sample_ids = profile_header.split()[2:]

            for line in profile_data:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or truncated line: there is no gene id to use.
                    continue
                gene_id = fields[1]
                # Values are paired with header columns by position; any
                # values beyond the last header column are ignored.
                # NOTE(review): split() collapses repeated whitespace, so an
                # empty field would shift later values left -- confirm the
                # response never contains empty fields.
                for sample_id, value in zip(sample_ids, fields[2:]):
                    sample_data[sample_id][gene_id] = value

            for line in clin_data:
                fields = line.split()
                if len(fields) < 2:
                    # Blank line, or a line with a key but no value.
                    continue
                sample_data[fields[0]]["AGE"] = fields[1]

    return sample_data
Ejemplo n.º 2
0
def collect_data(infile, genes):
    """Collect per-sample gene values and clinical values via ``cbioportal``.

    ``infile`` is parsed as a tab-delimited file whose first line is a header
    and is discarded.  Blank rows are ignored, and rows whose first field
    starts with ``#`` are skipped with a warning on stderr.  For every other
    row, ``row[1]`` and ``row[2]`` are forwarded together with ``genes`` to
    ``cbioportal.get_multi_gene`` and ``row[1]`` is forwarded to
    ``cbioportal.get_clin_data`` (presumably a case-list id and a profile
    id -- confirm against the cbioportal helper).

    Both responses are treated as lists of text lines whose first line is a
    header.  If either response has no lines after its header, an error is
    written to stderr and the row is skipped.

    Args:
        infile: Path of the tab-delimited query file.
        genes: Passed through unchanged to ``cbioportal.get_multi_gene``.

    Returns:
        A ``defaultdict(dict)``.  Its keys are the profile header tokens
        after the first two, plus the first token of every clinical line.
        Each value maps the second token of a profile line (the gene id)
        to that line's value for the column, and ``'AGE'`` to the second
        token of the matching clinical line.  All values are kept as the
        strings found in the responses.
    """
    sample_data = defaultdict(dict)

    # Python 2 needs the 'U' flag for universal newlines; Python 3 enables
    # them by default and rejects the flag from 3.11 onwards.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    with open(infile, mode) as casefile:
        reader = csv.reader(casefile, dialect='excel-tab')
        # Discard the header line; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv yields [] for blank lines (e.g. a trailing newline).
                continue
            if row[0].startswith("#"):
                sys.stderr.write("WARNING: Skipping commented input line\n")
                continue

            # Echo of the row being queried, kept on stdout as before.
            # Parenthesised so it parses on both Python 2 and Python 3.
            print(row)
            profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
            clin_data = cbioportal.get_clin_data(row[1])

            # An entirely empty response has no header to pop; fall back to
            # an empty header so the error branches below are still reached.
            profile_header = profile_data.pop(0) if profile_data else ""
            clin_header = clin_data.pop(0) if clin_data else ""

            if not profile_data:
                sys.stderr.write("ERROR: No profile data retrieved for query, "
                                 "with response header: {}\n".format(profile_header))
                continue

            if not clin_data:
                sys.stderr.write("ERROR: No clinical data retrieved for query, "
                                 "with response header: {}\n".format(clin_header))
                continue

            # The first two header tokens label the two leading columns of
            # each data line; the remaining tokens name the value columns.
            sample_ids = profile_header.split()[2:]

            for line in profile_data:
                fields = line.split()
                if len(fields) < 2:
                    # Blank or truncated line: there is no gene id to use.
                    continue
                gene_id = fields[1]
                # Values are paired with header columns by position; any
                # values beyond the last header column are ignored.
                # NOTE(review): split() collapses repeated whitespace, so an
                # empty field would shift later values left -- confirm the
                # response never contains empty fields.
                for sample_id, value in zip(sample_ids, fields[2:]):
                    sample_data[sample_id][gene_id] = value

            for line in clin_data:
                fields = line.split()
                if len(fields) < 2:
                    # Blank line, or a line with a key but no value.
                    continue
                sample_data[fields[0]]['AGE'] = fields[1]

    return sample_data
    parser.add_argument('-c', '--cases', help="Input file with study, case, and profile ids [Required]")
    parser.add_argument('-g', '--genes', help="Text file with list of genes to evaluate for correlations")
    parser.add_argument('-o', '--output', help='Output file name')

    args = parser.parse_args()

    with open(args.genes, 'rU') as genefile:
        genes = genefile.read().splitlines()

    with open(args.output, 'w') as outfile:
        outfile.write("Study\tCase List\tProfile\tGenes\tNumbers\tSpearman's Rho\tP-Value\n")
        with open(args.cases, 'rU') as casefile:
            reader = csv.reader(casefile, dialect='excel-tab')
            reader.next()
            for row in reader:
                profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
                header = profile_data.pop(0)

                for pair in itertools.combinations(profile_data, 2):
                    data1 = pair[0].split()
                    data2 = pair[1].split()

                    # Remove Gene ID
                    data1.pop(0)
                    data2.pop(0)

                    gene1 = data1.pop(0)
                    gene2 = data2.pop(0)

                    expression1 = list()
                    expression2 = list()
        outfile.write(
            "Study\tCase List\tProfile\tGenes\tMin Exp\t25th Pctl\tMedian\t75th Pctl\tMax\t"
            "Bin1 (Low) #\tBin1 R\tBin1 p\t"
            "Bin2 #\tBin2 R\tBin2 p\t"
            "Bin3 #\tBin3 R\tBin3 p\t"
            "Bin4 (High) #\tBin4 R\tBin4 p\n")
        with open(args.cases, 'rU') as casefile:
            reader = csv.reader(casefile, dialect='excel-tab')
            reader.next()
            for row in reader:
                if row[0].startswith("#"):
                    sys.stderr.write(
                        "WARNING: Skipping commented input line\n")
                    continue

                profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
                header = profile_data.pop(0)

                if len(profile_data) <= 0:
                    sys.stderr.write(
                        "ERROR: No data retrieved for query, with response header: {}\n"
                        .format(header))
                    continue

                # Because gene expression data is returned in alphabetical order we have to find the line
                # containing our control gene data line, remove it from the data_lines, and isolated it for comparisons
                primary_data = list()
                i = 0
                for line in profile_data:
                    # print line
                    data = line.split()