def guess_filetype(track_dict, available_tracks):
    """
    Detect the track type of a file from the ending of its name.

    :param track_dict: dictionary of track values with the 'file' key
                       containing a string path of the file or files.
                       The whole stripped value is compared to the endings,
                       so when several files are listed only the ending of
                       the last one is used. The 'section_name' key is read
                       to build the error message
    :param available_tracks: dictionary whose values are track classes
                             providing SUPPORTED_ENDINGS and TRACK_TYPE
                             (the keys are not used here)
    :return: string file type detected
    :raises InputError: if no ending matches, or if an ending matches while
                        the detected type already equals the TRACK_TYPE of
                        the class being examined
    """
    path = track_dict['file'].strip()
    detected = None
    for candidate in available_tracks.values():
        matching = (suffix for suffix in candidate.SUPPORTED_ENDINGS
                    if path.endswith(suffix))
        for _ in matching:
            # NOTE(review): this only fires when the already detected type
            # is the same as the one of this class; a match from a class
            # with a different TRACK_TYPE silently replaces the previous
            # one. Confirm this is the intended check.
            if detected == candidate.TRACK_TYPE:
                raise InputError("file_type already defined in other"
                                 " GenomeTrack")
            detected = candidate.TRACK_TYPE

    if detected is None:
        raise InputError("Section {}: can not identify file type. Please"
                         " specify the file_type for '{}'"
                         "".format(track_dict['section_name'], path))

    return detected
def check_file_exists(track_dict, tracks_path, is_hic=False):
    """
    Checks that every file listed in the '*file' entries can be opened.

    Each entry whose key ends with "file" is split on spaces. A name that
    cannot be opened as given is looked for relative to the path of the
    tracks file; in such case the stored path is updated.

    :param track_dict: dictionary of track values. Should contain a 'file'
                       key containing the path of the file or files to be
                       checked separated by space
                       For example: file1 file2 file3
                       The 'section_name' key is used in the error message
    :param tracks_path: path of the tracks file
    :param is_hic: if True, for names not ending with '.h5' only the part
                   before the first "::" is opened; the complete name is
                   still the one stored
    :return: the same track_dict, with the checked entries rewritten as
             single-space separated names
    :raises InputError: if a file is found in neither location
    """
    for field in track_dict:
        if not field.endswith("file"):
            continue

        # # THIS COULD BE REMOVED IN A NEXT 1.0 VERSION
        if field == 'boundaries_file':
            log.warning("Deprecation Warning: "
                        "The boundaries_file is not used anymore."
                        " It will be ignored."
                        " Please use another track with the"
                        " `overlay_previous` option.\n")
        # # END

        resolved_names = []
        # filter(None, ...) drops the empty strings left by repeated spaces
        for entry in filter(None, track_dict[field].split(" ")):
            if is_hic and not entry.endswith('.h5'):
                to_open = entry.split("::")[0]
            else:
                to_open = entry

            try:
                open(to_open, 'r').close()
            except IOError:
                # try to find the file in the same path as the
                # track file
                relocated = tracks_path + "/" + entry
                relocated_to_open = tracks_path + "/" + to_open
                try:
                    open(relocated_to_open, 'r').close()
                except IOError:
                    raise InputError(
                        f"File in section [{track_dict['section_name']}] "
                        f"not found:\n{entry}\n\n")
                resolved_names.append(relocated)
            else:
                resolved_names.append(entry)

        track_dict[field] = " ".join(resolved_names)

    return track_dict
def __init__(self, file_path, prefered_name="transcript_name",
             merge_transcripts=True):
    """
    Open a gtf file (or an existing gffutils sqlite database) and prepare
    the iteration over its features.

    :param file_path: the path of the gtf file
    :param prefered_name: stored as self.prefered_name. The original author
                          notes that the name written should be the
                          transcript_name but that it can be changed to
                          gene_name
    :param merge_transcripts: if True the "gene" features are iterated,
                              else the "transcript" features
    :raises InputError: if the database creation fails with a ValueError
                        other than "No lines parsed"
    :return:
    """
    self.file_type = 'bed12'

    # list of bed fields
    self.fields = [
        'chromosome', 'start', 'end', 'name', 'score', 'strand',
        'thick_start', 'thick_end', 'rgb', 'block_count', 'block_sizes',
        'block_starts'
    ]
    self.BedInterval = collections.namedtuple('BedInterval', self.fields)
    self.prefered_name = prefered_name
    self.merge_transcripts = merge_transcripts

    # Will process the gtf to get one item per transcript:
    # This will create a database:
    try:
        if _is_sqlite3(file_path):
            self.db = gffutils.FeatureDB(file_path)
        else:
            self.db = gffutils.create_db(file_path, ':memory:')
    except ValueError as ve:
        if "No lines parsed" not in str(ve):
            raise InputError("This is not a gtf file.")
        # Nothing could be parsed: no database is set and the opened file
        # itself is kept as the (zero length) collection of transcripts.
        self.length = 0
        self.all_transcripts = open(file_path, 'r')
    else:
        feature_type = "gene" if self.merge_transcripts else "transcript"
        # First pass only counts the features, second one is kept for
        # the iteration, ordered by start.
        self.length = sum(1 for _ in self.db.features_of_type(feature_type))
        self.all_transcripts = self.db.features_of_type(feature_type,
                                                        order_by='start')
def check_file_exists(track_dict, tracks_path):
    """
    Checks if a file or list of files exists. If the file does not exist
    tries to check if the file may be relative to the track_file path,
    in such case the path is updated.

    :param track_dict: dictionary of track values. Should contain
                        a 'file' key containing the path of the file
                        or files to be checked separated by space
                        For example: file1 file2 file3
                        The 'section_name' key is used in the error message
    :param tracks_path: path of the tracks file
    :return: the same track_dict, where every entry whose key ends with
             "file" has been rewritten with the paths that could be opened,
             separated by a single space
    :raises InputError: if a file is found in neither location
    """
    for key in track_dict.keys():
        if key.endswith("file"):
            file_field_name = key
            # # THIS COULD BE REMOVED IN A NEXT 1.0 VERSION
            if file_field_name == 'boundaries_file':
                # log.warning is used because log.warn is a deprecated alias
                log.warning("The boundaries_file is not used anymore"
                            " please use another track with the"
                            " `overlay_previous` option")
            # # END
            # Repeated spaces give empty names which are dropped
            file_names = [
                x for x in track_dict[file_field_name].split(" ") if x != ''
            ]
            full_path_file_names = []
            for file_name in file_names:
                try:
                    # Opening (and closing right away) is the existence test
                    open(file_name, 'r').close()
                    full_path_file_names.append(file_name)
                except IOError:
                    try:
                        # try to find the file in the same path as
                        # the tracks file
                        name_with_tracks_path = tracks_path + "/" + file_name
                        open(name_with_tracks_path, 'r').close()
                        full_path_file_names.append(name_with_tracks_path)
                    except IOError:
                        raise InputError("File in section [{}] "
                                         "not found:\n{}\n\n"
                                         "".format(
                                             track_dict['section_name'],
                                             file_name))

            track_dict[file_field_name] = " ".join(full_path_file_names)
    return track_dict
def process_link_file(self):
    """
    Read the links file given in self.properties['file'] and store the
    links of each chromosome in an interval tree.

    Lines starting with 'browser', 'track' or '#' are ignored. Links
    between two different chromosomes are skipped with a warning before
    any further parsing, so they contribute neither to the score range
    nor to the detection of the score column.

    :return: tuple (interval_tree, min_score, max_score, has_score) where
             interval_tree is a dictionary with one IntervalTree per
             chromosome. Each interval spans from the smallest start to
             the largest end and its data is
             [start1, end1, start2, end2, score].
             has_score becomes False as soon as one used line has no
             valid score; from then on the score stored is nan
    :raises InputError: if a line has less than 6 fields or if one of the
                        coordinates is not an integer
    :raises AssertionError: if a start is larger than the corresponding end
    """
    # the file format expected is similar to file format of links in
    # circos:
    # chr1 100 200 chr1 250 300 0.5
    # where the last value is a score.

    valid_intervals = 0
    interval_tree = {}
    has_score = True
    max_score = float('-inf')
    min_score = float('inf')
    with open(self.properties['file'], 'r') as file_h:
        # Iterating over the handle avoids loading the whole file
        for line_number, line in enumerate(file_h, start=1):
            if line.startswith(('browser', 'track', '#')):
                continue

            fields = line.strip().split('\t')
            try:
                chrom1, start1, end1, chrom2, start2, end2 = fields[:6]
            except ValueError as detail:
                raise InputError(
                    'File not valid. The format is chrom1 start1, end1, '
                    'chrom2, start2, end2\nError: {}\n in line\n {}'.
                    format(detail, line))

            # Checked first so that a skipped line can not change
            # min_score, max_score or has_score.
            if chrom1 != chrom2:
                self.log.warning(
                    "Only links in same chromosome are used. "
                    "Skipping line\n{}\n".format(line))
                continue

            try:
                start1 = int(start1)
                end1 = int(end1)
                start2 = int(start2)
                end2 = int(end2)
            except ValueError as detail:
                raise InputError(
                    "Error reading line: {}. One of the fields is not "
                    "an integer.\nError message: {}".format(
                        line_number, detail))

            assert start1 <= end1, \
                "Error in line #{}, start1 larger than end1 in {}".format(
                    line_number, line)
            assert start2 <= end2, \
                "Error in line #{}, start2 larger than end2 in {}".format(
                    line_number, line)

            # Once the scores are known to be unusable every link gets nan
            # instead of the unparsed text of the 7th field.
            score = np.nan
            if has_score:
                if len(fields) < 7:
                    has_score = False
                else:
                    raw_score = fields[6]
                    try:
                        score = float(raw_score)
                    except ValueError as detail:
                        self.log.warning(
                            "Warning: reading line: {}. The score is not "
                            "valid {} will not be used. "
                            "\nError message: {}".format(
                                line_number, raw_score, detail))
                        has_score = False
                    else:
                        if score < min_score:
                            min_score = score
                        if score > max_score:
                            max_score = score

            if chrom1 not in interval_tree:
                interval_tree[chrom1] = IntervalTree()

            # The first region is always the one with the smallest start
            if start2 < start1:
                start1, start2 = start2, start1
                end1, end2 = end2, end1

            # each interval spans from the smallest start to the largest end
            interval_tree[chrom1].add(
                Interval(start1, end2, [start1, end1, start2, end2, score]))
            valid_intervals += 1

    if valid_intervals == 0:
        self.log.warning("No valid intervals were found in file {}".format(
            self.properties['file']))

    return (interval_tree, min_score, max_score, has_score)
def process_link_file(self, plot_regions):
    """
    Read the links file given in self.properties['file'] and store the
    links of each chromosome in an interval tree.

    Lines starting with 'browser', 'track' or '#' are ignored and links
    between two different chromosomes are skipped with a warning.

    :param plot_regions: None to read the whole file, or an iterable of
                         (chrom, start, end); in that case the file is
                         first intersected with the whole chromosomes
                         present in plot_regions
    :return: tuple (interval_tree, min_score, max_score, has_score) where
             interval_tree is a dictionary with one IntervalTree per
             chromosome and the data of each interval is
             [start1, end1, start2, end2, score]. When
             self.properties['use_middle'] is set the interval spans from
             the middle of the first region to the middle of the second
             one, else from the smallest start to the largest end.
             has_score becomes False as soon as one used line has no
             valid score; from then on the score stored is nan
    :raises InputError: if a line has less than 6 fields or if one of the
                        coordinates is not an integer
    :raises AssertionError: if a start is larger than the corresponding end
    """
    # the file format expected is similar to file format of links in
    # circos:
    # chr1 100 200 chr1 250 300 0.5
    # where the last value is a score.

    if plot_regions is None:
        file_to_open = self.properties['file']
    else:
        # To be sure we do not miss links we will intersect with bed with
        # only chromosomes used in plot_regions
        plot_regions_adapted = [(chrom, 0, HUGE_NUMBER)
                                for chrom, __, __ in plot_regions]
        file_to_open = temp_file_from_intersect(self.properties['file'],
                                                plot_regions_adapted)

    valid_intervals = 0
    interval_tree = {}
    has_score = True
    max_score = float('-inf')
    min_score = float('inf')
    file_h = opener(file_to_open)
    # The handle is closed in the finally clause so that it is released
    # too when a malformed line raises.
    try:
        for line_number, line in enumerate(tqdm(file_h.readlines()),
                                           start=1):
            line = to_string(line)
            if line.startswith(('browser', 'track', '#')):
                continue

            fields = line.strip().split('\t')
            try:
                chrom1, start1, end1, chrom2, start2, end2 = fields[:6]
            except ValueError as detail:
                raise InputError('File not valid. The format is chrom1'
                                 ' start1, end1, '
                                 f'chrom2, start2, end2\nError: {detail}\n'
                                 f' in line\n {line}')

            if chrom1 != chrom2:
                self.log.warning("Only links in same chromosome are used. "
                                 f"Skipping line\n{line}\n")
                continue

            try:
                start1 = int(start1)
                end1 = int(end1)
                start2 = int(start2)
                end2 = int(end2)
            except ValueError as detail:
                raise InputError(
                    f"Error reading line: {line_number}. One of the fields "
                    f"is not an integer.\nError message: {detail}")

            assert start1 <= end1, \
                f"Error in line #{line_number}, start1 larger than end1 in {line}"
            assert start2 <= end2, \
                f"Error in line #{line_number}, start2 larger than end2 in {line}"

            # Once the scores are known to be unusable every link gets nan
            # instead of the unparsed text of the 7th field.
            score = np.nan
            if has_score:
                if len(fields) < 7:
                    has_score = False
                else:
                    raw_score = fields[6]
                    try:
                        score = float(raw_score)
                    except ValueError as detail:
                        self.log.warning(
                            f"Warning: reading line: {line}. The score is "
                            f"not valid {raw_score} will not be used. "
                            f"\nError message: {detail}\n")
                        has_score = False
                    else:
                        if score < min_score:
                            min_score = score
                        if score > max_score:
                            max_score = score

            if chrom1 not in interval_tree:
                interval_tree[chrom1] = IntervalTree()

            # The first region is always the one with the smallest start
            if start2 < start1:
                start1, start2 = start2, start1
                end1, end2 = end2, end1

            link_data = [start1, end1, start2, end2, score]
            if self.properties['use_middle']:
                mid1 = (start1 + end1) / 2
                mid2 = (start2 + end2) / 2
                interval_tree[chrom1].add(Interval(mid1, mid2, link_data))
            else:
                # each interval spans from the smallest start to the
                # largest end
                interval_tree[chrom1].add(Interval(start1, end2, link_data))
            valid_intervals += 1
    finally:
        file_h.close()

    if valid_intervals == 0:
        self.log.warning("No valid intervals were found in file "
                         f"{self.properties['file']}.\n")

    return (interval_tree, min_score, max_score, has_score)
def parse_tracks(self, tracks_file):
    """
    Parses a configuration file

    Every section which is not skipped becomes one dictionary of
    properties (the future self.properties of the track). The section
    with `type = vlines` is stored apart.

    :param tracks_file: file path containing the track configuration
    :return: None. The list of dictionaries (one per track) is stored in
             self.track_list, the vlines section in self.vlines_properties
             and, when it is set, its intervals in self.vlines_intval_tree
    :raises InputError: if a section is not valid (no file for vlines,
                        unknown file_type, missing necessary property,
                        value of the wrong type or out of range, invalid
                        overlay_previous)
    """
    parser = ConfigParser(dict_type=MultiDict, strict=False)
    # The configuration file is closed as soon as it has been read
    with open(tracks_file, 'r') as config_handle:
        parser.read_file(config_handle)

    tracks_file_path = os.path.dirname(tracks_file)

    track_list = []
    for section_name in parser.sections():
        # track_options is what will become the self.properties
        track_options = {"section_name": section_name}
        section_items = parser.items(section_name)
        all_keywords = [i[0] for i in section_items]
        # First we check if there is a skip set to true:
        if 'skip' in all_keywords and \
                parser.getboolean(section_name, 'skip'):
            # In this case we just do not explore the section
            continue
        # Then the vlines are treated differently:
        if ('type', 'vlines') in section_items:
            # The only thing to check is the file
            # There is no other parameters to use.
            if 'file' not in all_keywords:
                raise InputError("The section {} is supposed to be a vline"
                                 " but there is no file."
                                 "".format(section_name))
            track_options['file'] = parser.get(section_name, 'file')
            if len(all_keywords) > 2:
                extra_keywords = [
                    k for k in all_keywords if k not in ['file', 'type']
                ]
                log.warning("These parameters were specified but will not"
                            " be used {}".format(' '.join(extra_keywords)))
            self.vlines_properties = \
                self.check_file_exists(track_options, tracks_file_path)
            continue
        # For the other cases, we will append properties dictionaries
        # to the track_list
        # If the sections are spacer or x-axis we fill the file_type:
        # (They are special sections where the title defines the track type)
        if section_name.endswith('[spacer]'):
            track_options['file_type'] = 'spacer'
            track_options['track_class'] = SpacerTrack
        elif section_name.endswith('[x-axis]'):
            track_options['file_type'] = 'x_axis'
            track_options['track_class'] = XAxisTrack
        # For the others we need to have a 'file_type'
        # Either the file_type is part of the keywords
        elif 'file_type' in all_keywords:
            track_options['file_type'] = parser.get(
                section_name, 'file_type')
            if track_options['file_type'] not in self.available_tracks:
                raise InputError("Section {}: the file_type {} does not"
                                 " exists.\npossible file_type are:{}."
                                 "".format(section_name,
                                           track_options['file_type'],
                                           self.available_tracks.keys()))
            track_options['track_class'] = \
                self.available_tracks[track_options['file_type']]
        # Or we guess it from the file:
        elif 'file' in all_keywords:
            track_options['file'] = parser.get(section_name, 'file')
            track_options['file_type'] = \
                self.guess_filetype(track_options, self.available_tracks)
            track_options['track_class'] = \
                self.available_tracks[track_options['file_type']]
        else:
            raise InputError("Section {}: there is no file_type nor file "
                             "specified and it is not a [spacer] nor a "
                             "[x-axis] section. This is not a valid "
                             "section.".format(section_name))
        # Now we should have a 'track_class' set.
        # We can get for it all the necessary and possible keywords
        track_class = track_options['track_class']
        NECESSARY_PROPERTIES = track_class.NECESSARY_PROPERTIES
        for necessary_name in NECESSARY_PROPERTIES:
            if necessary_name not in all_keywords:
                raise InputError("The section {} is describing a object of"
                                 " type {} but the necessary property {}"
                                 " is not part of the config file."
                                 "".format(section_name, track_class,
                                           necessary_name))
        unused_keys = []
        # Now we can proceed with the keywords:
        for name, value in section_items:
            # To be removed in the next 1.0 version
            if ' ' in name:
                old_name = name
                name = '_'.join(name.split(' '))
                log.warning("Deprecated Warning: The section {} uses"
                            " parameter {} but there is no more parameter"
                            " with space in name. Will be substituted by {}."
                            "".format(section_name, old_name, name))
            else:
                old_name = name
            # end
            SYNONYMOUS_PROPERTIES = track_class.SYNONYMOUS_PROPERTIES
            # If the name is part of the synonymous we substitute by
            # the synonymous value
            if name in SYNONYMOUS_PROPERTIES and \
                    value in SYNONYMOUS_PROPERTIES[name]:
                track_options[name] = SYNONYMOUS_PROPERTIES[name][value]
            elif name in track_class.STRING_PROPERTIES:
                track_options[name] = value
            elif name in track_class.BOOLEAN_PROPERTIES:
                try:
                    # I need to use old_name here else I get a KeyError:
                    track_options[name] = parser.getboolean(
                        section_name, old_name)
                    # In the next 1.0 should be:
                    # track_options[name] = parser.getboolean(section_name,
                    #                                         name)
                except ValueError:
                    raise InputError("In section {}, {} was set to {}"
                                     " whereas we should have a boolean "
                                     "value. Please, use true or false."
                                     "".format(section_name, old_name,
                                               value))
                    # In the next 1.0 should be:
                    # "".format(section_name, name,
                if value.lower() not in ['true', 'false']:
                    log.warning("Deprecation Warning: "
                                "In section {}, {} was set to {}"
                                " whereas in the future only"
                                " true and false value will be"
                                " accepted".format(section_name, name,
                                                   value))
            elif name in track_class.FLOAT_PROPERTIES:
                try:
                    track_options[name] = float(value)
                except ValueError:
                    raise InputError("In section {}, {} was set to {}"
                                     " whereas we should have a float "
                                     "value.".format(
                                         section_name, name, value))
                # Each float property comes with its [min, max] range
                min_value, max_value = track_class.FLOAT_PROPERTIES[name]
                if track_options[name] < min_value or \
                        track_options[name] > max_value:
                    raise InputError("In section {}, {} was set to {}"
                                     " whereas it should be between {} and"
                                     " {}.".format(section_name, name,
                                                   value, min_value,
                                                   max_value))
            elif name in track_class.INTEGER_PROPERTIES:
                try:
                    track_options[name] = int(value)
                except ValueError:
                    raise InputError("In section {}, {} was set to {}"
                                     " whereas we should have an integer "
                                     "value.".format(
                                         section_name, name, value))
                # Each integer property comes with its [min, max] range
                min_value, max_value = track_class.INTEGER_PROPERTIES[name]
                if track_options[name] < min_value or \
                        track_options[name] > max_value:
                    raise InputError("In section {}, {} was set to {}"
                                     " whereas it should be between {} and"
                                     " {}.".format(section_name, name,
                                                   value, min_value,
                                                   max_value))
            else:
                unused_keys.append(name)
        # If there are unused keys they are printed in a warning.
        if len(unused_keys) > 0:
            log.warning("In section {}, these parameters are unused:"
                        "{}".format(section_name, unused_keys))
        # The track_options will be checked for the file paths:
        track_options = self.check_file_exists(track_options,
                                               tracks_file_path)
        # The 'overlay_previous' is initialized:
        if 'overlay_previous' not in track_options:
            track_options['overlay_previous'] = 'no'
        if track_options['overlay_previous'] not in [
                'no', 'yes', 'share-y'
        ]:
            raise InputError(
                "In section {}, overlay_previous was set to {}."
                " Possible options are no, yes, share-y"
                "".format(section_name, track_options['overlay_previous']))
        # If there is no title:
        if 'title' not in track_options:
            track_options['title'] = ''
            if track_options['overlay_previous'] == 'no' and \
                    track_options['track_class'] not in [SpacerTrack,
                                                         XAxisTrack]:
                log.warning("title not set for section {}"
                            "\n".format(track_options['section_name']))
        # The track_options are added to the track_list
        track_list.append(track_options)
    # Now that they were all checked
    self.track_list = track_list
    if self.vlines_properties:
        self.vlines_intval_tree, __, __ = \
            file_to_intervaltree(self.vlines_properties['file'])
def parse_tracks(self, tracks_file, plot_regions=None):
    """
    Parses a configuration file

    Every section which is not skipped becomes one dictionary of
    properties (the future self.properties of the track). The section
    with `type = vlines` is stored apart.

    :param tracks_file: file path containing the track configuration.
                        A string which is not the path of an existing
                        file is read as the configuration itself
    :param plot_regions: a list of tuple [(chrom1, start1, end1),
                         (chrom2, start2, end2)] on which the data
                         should be loaded here the vlines
    :return: None. The list of dictionaries (one per track) is stored in
             self.track_list, the vlines section in self.vlines_properties
             and, when it is set, its intervals in self.vlines_intval_tree
    :raises InputError: if a section is not valid (no file for vlines,
                        unknown file_type, missing necessary property,
                        value of the wrong type or out of range, invalid
                        overlay_previous)
    """
    import io

    from_config_string = False
    try:
        if isinstance(tracks_file, str) and \
                not pathlib.Path(tracks_file).exists():
            # assume the tracks_file are the config string
            from_config_string = True
            handle = io.StringIO(tracks_file)
        else:
            handle = open(tracks_file, 'r')
    except OSError:
        # if the config string too long, path exists raise error.
        from_config_string = True
        handle = io.StringIO(tracks_file)

    parser = ConfigParser(dict_type=MultiDict, strict=False)
    # The handle is closed even when the configuration can not be parsed
    with handle:
        parser.read_file(handle)

    if from_config_string:
        # There is no tracks file whose directory could be used: the
        # directory name of the configuration text would be meaningless,
        # relative files are looked for in the current directory only.
        tracks_file_path = '.'
    else:
        tracks_file_path = os.path.dirname(tracks_file)

    track_list = []
    for section_name in parser.sections():
        # track_options is what will become the self.properties
        track_options = {"section_name": section_name}
        section_items = parser.items(section_name)
        all_keywords = [i[0] for i in section_items]
        # First we check if there is a skip set to true:
        if 'skip' in all_keywords and \
                parser.getboolean(section_name, 'skip'):
            # In this case we just do not explore the section
            continue
        # Then the vlines are treated differently:
        if ('type', 'vlines') in section_items:
            # The only thing to check is the file
            # There is no other parameters to use.
            if 'file' not in all_keywords:
                raise InputError(
                    f"The section {section_name} is supposed to be a vline"
                    " but there is no file.")
            track_options['file'] = parser.get(section_name, 'file')
            if 'line_width' in all_keywords:
                try:
                    track_options['line_width'] = float(
                        parser.get(section_name, 'line_width'))
                except ValueError:
                    raise InputError(
                        f"In section {section_name}, line_width "
                        f"was set to {parser.get(section_name, 'line_width')}"
                        " whereas we should have a float "
                        "value.")
            extra_keywords = [
                k for k in all_keywords
                if k not in ['file', 'type', 'line_width']
            ]
            if len(extra_keywords) > 0:
                log.warning("These parameters were specified but will not"
                            f" be used {' '.join(extra_keywords)}.\n")
            self.vlines_properties = \
                self.check_file_exists(track_options, tracks_file_path)
            continue
        # For the other cases, we will append properties dictionaries
        # to the track_list
        # If the sections are spacer or x-axis we fill the file_type:
        # (They are special sections where the title defines the track type)
        if section_name.endswith('[spacer]'):
            track_options['file_type'] = 'spacer'
            track_options['track_class'] = SpacerTrack
        elif section_name.endswith('[x-axis]'):
            track_options['file_type'] = 'x_axis'
            track_options['track_class'] = XAxisTrack
        # For the others we need to have a 'file_type'
        # Either the file_type is part of the keywords
        elif 'file_type' in all_keywords:
            track_options['file_type'] = parser.get(
                section_name, 'file_type')
            if track_options['file_type'] not in self.available_tracks:
                raise InputError(f"Section {section_name}: the file_type "
                                 f"{track_options['file_type']} does not"
                                 " exists.\npossible file_type are:"
                                 f"{self.available_tracks.keys()}.")
            track_options['track_class'] = \
                self.available_tracks[track_options['file_type']]
        # Or we guess it from the file:
        elif 'file' in all_keywords:
            track_options['file'] = parser.get(section_name, 'file')
            track_options['file_type'] = \
                self.guess_filetype(track_options, self.available_tracks)
            track_options['track_class'] = \
                self.available_tracks[track_options['file_type']]
        else:
            raise InputError(
                f"Section {section_name}: there is no file_type nor file "
                "specified and it is not a [spacer] nor a "
                "[x-axis] section. This is not a valid "
                "section.")
        # Now we should have a 'track_class' set.
        # We can get for it all the necessary and possible keywords
        track_class = track_options['track_class']
        NECESSARY_PROPERTIES = track_class.NECESSARY_PROPERTIES
        for necessary_name in NECESSARY_PROPERTIES:
            if necessary_name not in all_keywords:
                raise InputError(f"The section {section_name} is "
                                 "describing a object of"
                                 f" type {track_class} but the necessary "
                                 f"property {necessary_name}"
                                 " is not part of the config file.")
        unused_keys = []
        # Now we can proceed with the keywords:
        for name, value in section_items:
            # To be removed in the next 1.0 version
            if ' ' in name:
                old_name = name
                name = '_'.join(name.split(' '))
                log.warning(
                    f"Deprecated Warning: The section {section_name} "
                    f"uses parameter {old_name} but there is no more "
                    "parameter with space in name. "
                    f"Will be substituted by {name}.\n")
            else:
                old_name = name
            # end
            SYNONYMOUS_PROPERTIES = track_class.SYNONYMOUS_PROPERTIES
            # If the name is part of the synonymous we substitute by
            # the synonymous value
            if name in SYNONYMOUS_PROPERTIES and \
                    value in SYNONYMOUS_PROPERTIES[name]:
                track_options[name] = SYNONYMOUS_PROPERTIES[name][value]
            elif name in track_class.STRING_PROPERTIES:
                track_options[name] = value
            elif name in track_class.BOOLEAN_PROPERTIES:
                try:
                    # I need to use old_name here else I get a KeyError:
                    track_options[name] = parser.getboolean(
                        section_name, old_name)
                    # In the next 1.0 should be:
                    # track_options[name] = parser.getboolean(section_name,
                    #                                         name)
                except ValueError:
                    raise InputError(f"In section {section_name}, "
                                     f"{old_name} was set to {value}"
                                     " whereas we should have a boolean "
                                     "value. Please, use true or false.")
                    # In the next 1.0 should be:
                    # f"{name} was set to {value}"
                if value.lower() not in ['true', 'false']:
                    log.warning("Deprecation Warning: "
                                f"In section {section_name}, {name} was "
                                f"set to {value}"
                                " whereas in the future only"
                                " true and false value will be"
                                " accepted.\n")
            elif name in track_class.FLOAT_PROPERTIES:
                try:
                    track_options[name] = float(value)
                except ValueError:
                    raise InputError(f"In section {section_name}, {name} "
                                     f"was set to {value}"
                                     " whereas we should have a float "
                                     "value.")
                # Each float property comes with its [min, max] range
                min_value, max_value = track_class.FLOAT_PROPERTIES[name]
                if track_options[name] < min_value or \
                        track_options[name] > max_value:
                    raise InputError(f"In section {section_name}, {name} "
                                     f"was set to {value}"
                                     " whereas it should be between "
                                     f"{min_value} and {max_value}.")
            elif name in track_class.INTEGER_PROPERTIES:
                try:
                    track_options[name] = int(value)
                except ValueError:
                    raise InputError(f"In section {section_name}, {name} "
                                     f"was set to {value}"
                                     " whereas we should have an integer "
                                     "value.")
                # Each integer property comes with its [min, max] range
                min_value, max_value = track_class.INTEGER_PROPERTIES[name]
                if track_options[name] < min_value or \
                        track_options[name] > max_value:
                    raise InputError(f"In section {section_name}, {name} "
                                     f"was set to {value}"
                                     " whereas it should be between "
                                     f"{min_value} and {max_value}.")
            else:
                unused_keys.append(name)
        # If there are unused keys they are printed in a warning.
        if len(unused_keys) > 0:
            log.warning(f"In section {section_name}, these parameters are "
                        f"unused:{unused_keys}.\n")
        # The track_options will be checked for the file paths:
        track_options = self.check_file_exists(
            track_options, tracks_file_path,
            track_options['file_type'] == 'hic_matrix')
        # The 'overlay_previous' is initialized:
        if 'overlay_previous' not in track_options:
            track_options['overlay_previous'] = 'no'
        if track_options['overlay_previous'] not in [
                'no', 'yes', 'share-y'
        ]:
            raise InputError(
                f"In section {section_name}, overlay_previous "
                f"was set to {track_options['overlay_previous']}."
                " Possible options are no, yes, share-y")
        # If there is no title:
        if 'title' not in track_options:
            track_options['title'] = ''
            if track_options['overlay_previous'] == 'no' and \
                    track_options['track_class'] not in [SpacerTrack,
                                                         XAxisTrack]:
                log.warning("title not set for section "
                            f"{track_options['section_name']}\n")
        # The track_options are added to the track_list
        track_list.append(track_options)
    # Now that they were all checked
    self.track_list = track_list
    if self.vlines_properties:
        self.vlines_intval_tree, __, __ = \
            file_to_intervaltree(self.vlines_properties['file'],
                                 plot_regions)