def main():
    """Print read lengths (or read headers) for a set of fastq accessions.

    The accessions come from the first of these sources that is set on the
    parsed arguments:

    * ``args.infile``    - a text file with one accession per line,
    * ``args.query``     - a query whose ``@graph`` results each carry a
      ``files`` list; every listed file whose ``file_format`` is ``fastq``
      contributes its accession,
    * ``args.accession`` - a single accession.

    If none is set, a message is printed and the process exits with
    status 1.

    For every record yielded by ``encodedcc.fastq_read`` for an accession,
    either the decoded header (when ``args.header`` is set) or
    ``<accession>\\t<sequence length>`` is printed.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    accessions = []
    if args.infile:
        # Close the file deterministically, and drop blank lines and stray
        # whitespace/CR characters that would otherwise produce malformed
        # download URIs.
        with open(args.infile) as handle:
            accessions = [line.strip() for line in handle if line.strip()]
    elif args.query:
        data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        for exp in data:
            for f in exp.get("files", []):
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    elif args.accession:
        accessions = [args.accession]
    else:
        print("No accessions to check")
        sys.exit(1)

    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            if args.header:
                print(header.decode("UTF-8"))
            else:
                print(acc + "\t" + str(len(sequence.decode("UTF-8"))))
def process_row(row, connection):
    """Convert one metadata row into a JSON-ready payload dict.

    Column names may carry a type tag after a colon (``name:type``):

    * ``int`` / ``integer`` - the cell is converted with ``int()``,
    * ``list`` / ``array``  - surrounding ``[]`` are stripped and the cell
      is split on commas,
    * anything else (or no tag) - the cell is parsed as JSON, falling back
      to a plain JSON string when it is not valid JSON.

    The columns ``flowcell``, ``machine``, ``lane`` and ``barcode`` are not
    copied to the top level; they are gathered into a single dict stored as
    the one-element list ``flowcell_details``.

    When ``file_format`` is ``fastq``, ``read_length`` is set from the
    records yielded by ``encodedcc.fastq_read`` for
    ``row["submitted_file_name"]`` (the last record wins).  A
    ``read_length`` column in the row overrides it.

    An integer ``paired_end`` is turned into its string form; when it is 1
    any ``paired_with`` entry is removed.

    Args:
        row: Mapping of column name to raw cell value.  Cells are handled
            as strings (presumably a ``csv.DictReader`` row - confirm
            against the caller).
        connection: Passed through to ``encodedcc.fastq_read``; only used
            for fastq rows.

    Returns:
        The payload dict, or ``None`` when a cell cannot be represented as
        JSON (a warning is logged in that case).

    Raises:
        ValueError: If a column tagged ``int``/``integer`` does not hold an
            integer literal.
    """
    flowcell_fields = ("flowcell", "machine", "lane", "barcode")
    json_payload = {}
    flowcell_dict = {}

    if row.get("file_format", "") == "fastq":
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, filename=row["submitted_file_name"]):
            json_payload["read_length"] = len(sequence.decode("UTF-8"))

    for key in row:
        parts = key.split(":")
        name = parts[0]
        type_tag = parts[1] if len(parts) > 1 else None

        if name in flowcell_fields:
            # Index with the full column name so a tagged flow-cell column
            # does not raise KeyError, and keep these fields out of the
            # top-level payload.
            flowcell_dict[name] = row[key]
            continue
        if not name:
            continue

        raw = row[key]
        if type_tag in ("int", "integer"):
            # Store the integer directly: feeding an int to json.loads
            # raises TypeError, which previously degraded it to a string.
            json_payload[name] = int(raw)
            continue
        if type_tag in ("list", "array"):
            json_payload[name] = raw.strip("[]").split(",")
            continue

        # Untagged columns and unrecognised tags both use the raw cell.
        try:
            json_payload[name] = json.loads(raw)
        except (TypeError, ValueError):
            try:
                json_payload[name] = json.loads('"%s"' % (raw,))
            except ValueError:
                logger.warning(
                    'Could not convert field %s value %s to JSON',
                    name, raw)
                return None

    if flowcell_dict:
        json_payload["flowcell_details"] = [flowcell_dict]

    paired_end = json_payload.get("paired_end")
    # bool is a subclass of int; only genuine integers are rewritten.
    if isinstance(paired_end, int) and not isinstance(paired_end, bool):
        if paired_end == 1:
            json_payload.pop("paired_with", None)
        json_payload["paired_end"] = str(paired_end)

    print(json_payload)
    return json_payload
def process_row(row, connection):
    """Convert one metadata row into a JSON-ready payload dict.

    Column names may carry a type tag after a colon (``name:type``):

    * ``int`` / ``integer`` - the cell is converted with ``int()``,
    * ``list`` / ``array``  - surrounding ``[]`` are stripped and the cell
      is split on commas,
    * anything else (or no tag) - the cell is parsed as JSON, falling back
      to a plain JSON string when it is not valid JSON.

    The columns ``flowcell``, ``machine``, ``lane`` and ``barcode`` are not
    copied to the top level; they are gathered into a single dict stored as
    the one-element list ``flowcell_details``.

    When ``file_format`` is ``fastq``, ``read_length`` is set from the
    records yielded by ``encodedcc.fastq_read`` for
    ``row["submitted_file_name"]`` (the last record wins).  A
    ``read_length`` column in the row overrides it.

    An integer ``paired_end`` is turned into its string form; when it is 1
    any ``paired_with`` entry is removed.

    Args:
        row: Mapping of column name to raw cell value.  Cells are handled
            as strings (presumably a ``csv.DictReader`` row - confirm
            against the caller).
        connection: Passed through to ``encodedcc.fastq_read``; only used
            for fastq rows.

    Returns:
        The payload dict, or ``None`` when a cell cannot be represented as
        JSON (a warning is logged in that case).

    Raises:
        ValueError: If a column tagged ``int``/``integer`` does not hold an
            integer literal.
    """
    flowcell_fields = ("flowcell", "machine", "lane", "barcode")
    json_payload = {}
    flowcell_dict = {}

    if row.get("file_format", "") == "fastq":
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, filename=row["submitted_file_name"]):
            json_payload["read_length"] = len(sequence.decode("UTF-8"))

    for key in row:
        parts = key.split(":")
        name = parts[0]
        type_tag = parts[1] if len(parts) > 1 else None

        if name in flowcell_fields:
            # Index with the full column name so a tagged flow-cell column
            # does not raise KeyError, and keep these fields out of the
            # top-level payload.
            flowcell_dict[name] = row[key]
            continue
        if not name:
            continue

        raw = row[key]
        if type_tag in ("int", "integer"):
            # Store the integer directly: feeding an int to json.loads
            # raises TypeError, which previously degraded it to a string.
            json_payload[name] = int(raw)
            continue
        if type_tag in ("list", "array"):
            json_payload[name] = raw.strip("[]").split(",")
            continue

        # Untagged columns and unrecognised tags both use the raw cell.
        try:
            json_payload[name] = json.loads(raw)
        except (TypeError, ValueError):
            try:
                json_payload[name] = json.loads('"%s"' % (raw,))
            except ValueError:
                logger.warning(
                    'Could not convert field %s value %s to JSON',
                    name, raw)
                return None

    if flowcell_dict:
        json_payload["flowcell_details"] = [flowcell_dict]

    paired_end = json_payload.get("paired_end")
    # bool is a subclass of int; only genuine integers are rewritten.
    if isinstance(paired_end, int) and not isinstance(paired_end, bool):
        if paired_end == 1:
            json_payload.pop("paired_with", None)
        json_payload["paired_end"] = str(paired_end)

    print(json_payload)
    return json_payload
def main():
    """Print read lengths (optionally with headers) for fastq accessions.

    The accessions come from the first of these sources that is set on the
    parsed arguments:

    * ``args.infile`` - either the path of an existing text file with one
      accession per line, or (when no such file exists) a comma-separated
      list of accessions,
    * ``args.query``  - if it contains ``"search"`` the ``@graph`` entries
      of the response are used, otherwise the response itself is treated
      as a single object; every entry's ``files`` are fetched and those
      whose ``file_format`` is ``fastq`` contribute their accession.

    If neither is set, a message is printed and the process exits with
    status 1.

    For every record yielded by ``encodedcc.fastq_read`` for an accession,
    ``<accession>\\t<sequence length>`` is printed, followed by
    ``\\t<header>`` when ``args.header`` is set.
    """
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)

    accessions = []
    if args.infile:
        if os.path.isfile(args.infile):
            # Close the file deterministically and skip blank lines, which
            # would otherwise become empty accessions.
            with open(args.infile) as handle:
                accessions = [line.strip() for line in handle if line.strip()]
        else:
            # Tolerate spaces after commas and trailing commas.
            accessions = [
                item.strip()
                for item in args.infile.split(",")
                if item.strip()
            ]
    elif args.query:
        if "search" in args.query:
            data = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            data = [encodedcc.get_ENCODE(args.query, connection)]
        for exp in data:
            for f in exp.get("files", []):
                res = encodedcc.get_ENCODE(f, connection)
                if res.get("file_format", "") == "fastq":
                    accessions.append(res["accession"])
    else:
        print("No accessions to check")
        sys.exit(1)

    for acc in accessions:
        link = "/files/" + acc + "/@@download/" + acc + ".fastq.gz"
        for header, sequence, qual_header, quality in encodedcc.fastq_read(
                connection, uri=link):
            # Measure the decoded sequence in both output modes so the
            # reported length does not depend on whether headers are shown.
            length = str(len(sequence.decode("UTF-8")))
            if args.header:
                print(acc + "\t" + length + "\t" + header.decode("UTF-8"))
            else:
                print(acc + "\t" + length)