def parse(tab_file, anno_id, genome_id, db_name, tab_type):
    """Collect rows of one type from a whitespace-delimited file and save them.

    Every line of ``tab_file`` is split on whitespace.  A row is kept when its
    second field equals ``tab_type[0]``.  The fields of a kept row are read
    as: 0 = name, 2 = sequence id, 3 = start, 4 = stop (start and stop are
    parsed as integers).

    Each kept row becomes a dict with the string values ``seq_lower_coor``,
    ``seq_upper_coor``, ``name``, ``seq_id``, ``freq_coor``
    (``"<lower>;<upper>"``) and ``strand``.  Rows are keyed by name, so a
    later row with the same name replaces an earlier one.

    If at least one row was kept, the collection is handed to
    ``sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id,
    anno_id, "")``.  Nothing is returned.

    Blank lines are skipped.  A non-blank line with fewer fields than are
    read still raises ``IndexError``, and non-numeric coordinates raise
    ``ValueError``.

    NOTE(review): a second ``parse`` with the same signature is defined later
    in this file and replaces this one at import time -- confirm which one
    should remain.
    """
    entries = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(tab_file, "r") as tab_handle:
        for line in tab_handle:
            values = line.split()
            if not values:
                # Blank line: there is no type field to compare.
                continue
            if tab_type[0] != str(values[1]):
                continue

            name = values[0]
            start = int(values[3])
            stop = int(values[4])
            strand = "+"
            if start >= stop:
                # Coordinates are stored low-to-high; the original order is
                # remembered through the strand.  Equal coordinates also take
                # this branch and are therefore reported as "-".
                start, stop = stop, start
                strand = "-"

            lower = str(start)
            upper = str(stop)
            entries[name] = {
                "seq_lower_coor": lower,
                "seq_upper_coor": upper,
                "name": name,
                "seq_id": values[2],
                "freq_coor": lower + ";" + upper,
                "strand": strand,
            }

    if entries:
        sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id, anno_id, "")
def parse_gff(gff3_file, db_name, allowed_gff3_types, genome_id, anno_id, anno_id_overwrite):
    """Extract elements of the allowed GFF3 types from a file and save them.

    Steps, as performed by the code:

    1. ``getGFF3Types`` and ``getGFF3TypesHierarchy`` are called on
       ``gff3_file``; each value of the resulting relation map is reduced to
       the list of its keys.
    2. For every ordered pair of allowed types, ``find_all_paths`` is queried
       on that map.  A path is accepted only if it contains *every* type in
       ``allowed_gff3_types``.  For each accepted path, the node at position
       ``i`` (1-based) is recorded in ``hierarchy[i]``.
    3. The highest level of ``hierarchy`` is passed to ``extractFragments``
       and the level above it to ``extractElements`` (either may be ``None``
       when the hierarchy has fewer than two levels).
    4. The result is stored through
       ``sqlite_methods.saveGFFTypeElementsInDb``.

    Any exception raised along the way -- including a failure of the
    path-length assertion -- is caught, reported on standard output as
    ``ERROR parse_gff <sys.exc_info()>``, and swallowed.  Nothing is returned.

    Parameters are forwarded as-is to the helpers named above; their exact
    expected types are defined by those helpers (not visible here).
    """
    try:
        gff3_types = getGFF3Types(gff3_file)
        gff3_relation_map = getGFF3TypesHierarchy(gff3_file, gff3_types)
        # Replace each inner mapping by a plain list of its keys.  Iterating
        # over a snapshot of the outer keys keeps this safe on Python 2 and 3.
        for key in list(gff3_relation_map):
            gff3_relation_map[key] = list(gff3_relation_map[key].keys())

        # hierarchy: 1-based depth in an accepted path -> {type name: 1}
        hierarchy = {}
        for source_type in allowed_gff3_types:
            for target_type in allowed_gff3_types:
                paths = find_all_paths(gff3_relation_map, source_type, target_type)
                for path in paths:
                    # A usable path must visit every allowed type.
                    if not all(wanted in path for wanted in allowed_gff3_types):
                        continue
                    for level, node in enumerate(path, 1):
                        hierarchy.setdefault(level, {})[node] = 1
                    # Accepted paths are expected to share the length of the
                    # first path returned for this pair.
                    # NOTE(review): nesting of this assert was reconstructed
                    # from a whitespace-collapsed source -- confirm.
                    assert len(paths[0]) == len(path)

        gff3_children = hierarchy.get(len(hierarchy))
        gff3_parents = hierarchy.get(len(hierarchy) - 1)

        index_list_parents_parents_allowed = extractElementsToRemove(gff3_file, hierarchy)
        index_list_parents = extractElements(
            gff3_file, gff3_parents, index_list_parents_parents_allowed)
        index_list_parents = extractFragments(gff3_file, gff3_children, index_list_parents)
        sqlite_methods.saveGFFTypeElementsInDb(
            index_list_parents, db_name, genome_id, anno_id, anno_id_overwrite)
    except Exception:
        # Best-effort: report and continue.  A single pre-formatted string
        # prints identically under the Python 2 statement and the Python 3
        # function form.
        print("ERROR parse_gff %s" % (sys.exc_info(),))
def parse(tab_file, anno_id, genome_id, db_name, tab_type):
    """Collect rows of one type from a whitespace-delimited file and save them.

    Every line of ``tab_file`` is split on whitespace.  A row is kept when its
    second field equals ``tab_type[0]``.  The fields of a kept row are read
    as: 0 = name, 2 = sequence id, 3 = start, 4 = stop (start and stop are
    parsed as integers).

    Each kept row becomes a dict with the string values ``seq_lower_coor``,
    ``seq_upper_coor``, ``name``, ``seq_id``, ``freq_coor``
    (``"<lower>;<upper>"``) and ``strand``.  Rows are keyed by name, so a
    later row with the same name replaces an earlier one.

    If at least one row was kept, the collection is handed to
    ``sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id,
    anno_id, "")``.  Nothing is returned.

    Blank lines are skipped.  A non-blank line with fewer fields than are
    read still raises ``IndexError``, and non-numeric coordinates raise
    ``ValueError``.

    NOTE(review): an earlier ``parse`` with the same signature is defined
    above in this file; this later definition is the one left bound after
    import -- confirm the duplicate can be removed.
    """
    entries = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(tab_file, "r") as tab_handle:
        for line in tab_handle:
            values = line.split()
            if not values:
                # Blank line: there is no type field to compare.
                continue
            if tab_type[0] != str(values[1]):
                continue

            name = values[0]
            start = int(values[3])
            stop = int(values[4])
            strand = "+"
            if start >= stop:
                # Coordinates are stored low-to-high; the original order is
                # remembered through the strand.  Equal coordinates also take
                # this branch and are therefore reported as "-".
                start, stop = stop, start
                strand = "-"

            lower = str(start)
            upper = str(stop)
            entries[name] = {
                "seq_lower_coor": lower,
                "seq_upper_coor": upper,
                "name": name,
                "seq_id": values[2],
                "freq_coor": lower + ";" + upper,
                "strand": strand,
            }

    if entries:
        sqlite_methods.saveGFFTypeElementsInDb(entries, db_name, genome_id, anno_id, "")