Example #1
    def getMatrix4plot(self, interval, binsize=None):
        def appendSeries2matrix(x, matrix, triangle="both"):  # x: row (Series) to write into matrix
            if triangle == "both":
                matrix[x["contact_en_bin"], x["contact_st_bin"]] = x["contact_count"]
                matrix[x["contact_st_bin"], x["contact_en_bin"]] = x["contact_count"]
            elif triangle == "upper":
                matrix[x["contact_st_bin"], x["contact_en_bin"]] = x["contact_count"]
            elif triangle == "lower":
                matrix[x["contact_en_bin"], x["contact_st_bin"]] = x["contact_count"]
            else:
                raise ValueError("Unknown triangle option: " + str(triangle))

        if not hasattr(self, "data"):
            logging.getLogger(__name__).error("Please provide the data first")
            return

        if binsize is None:  # infer binsize from data
            # dist = pd.unique(self.data["contact_en"]-self.data["contact_st"])
            # sorted_starts = np.sort(self.data["contact_st"].values[:min(1000,len(self.data))])
            # dist2 = np.unique(np.subtract(sorted_starts[1:],sorted_starts[:-1]))
            # assert (dist2 >= 0).all()
            # dist = np.unique(np.concatenate((dist,dist2)))
            # dist = dist[np.nonzero(dist)]
            # assert len(dist) > 0
            # binsize = min(dist)
            binsize = get_bin_size(self.data)
            logging.getLogger(__name__).info("Using binsize "+str(binsize))

        interval_size_bins = (interval.end - interval.start) // binsize + 1
        matrix = np.zeros(shape=(interval_size_bins, interval_size_bins))
        data = self.data.query("@interval.start <= contact_st <= @interval.end &"
                               "@interval.start <= contact_en <= @interval.end")
        data = self.convert2binned(data, interval, binsize)

        # the control dataset is optional
        with_control = getattr(self, "control", None) is not None

        if with_control:
            logging.getLogger(__name__).debug("Running with control")
            control = self.control.query("@interval.start <= contact_st <= @interval.end &"
                                   "@interval.start <= contact_en <= @interval.end")
            control = self.convert2binned(control, interval, binsize)
            data.apply(appendSeries2matrix, matrix=matrix, triangle="upper", axis="columns")
            control.apply(appendSeries2matrix, matrix=matrix, triangle="lower", axis="columns")
        else:
            data.apply(appendSeries2matrix, matrix=matrix, triangle="both", axis="columns")

        #remember values for future operations
        self.matrix = matrix
        self.binsize = binsize
        self.interval_size_bins = interval_size_bins
        self.interval = interval

        return matrix
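
The example's central step -- binning contact coordinates and writing the counts into a square matrix, with data and control split across the upper and lower triangles -- can be reproduced in a few lines. A minimal standalone sketch on a synthetic DataFrame (the column names follow the example above; the binsize and values are made up):

import numpy as np
import pandas as pd

binsize = 10
contacts = pd.DataFrame({"contact_st": [0, 10, 20],
                         "contact_en": [20, 30, 40],
                         "contact_count": [5.0, 3.0, 1.0]})

n_bins = contacts["contact_en"].max() // binsize + 1
matrix = np.zeros((n_bins, n_bins))

# vectorized equivalent of the row-wise apply with triangle="both":
# write the same counts into the upper and lower triangles
i = (contacts["contact_st"] // binsize).values
j = (contacts["contact_en"] // binsize).values
matrix[i, j] = contacts["contact_count"].values
matrix[j, i] = contacts["contact_count"].values

print(matrix)
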
Example #2
 def scc(self, validation_data, predicted, out_dir, **kwargs):
     # if self.apply_log:
     #     print(validation_data["contact_count"])
     #     print(predicted)
     #     validation_data["contact_count"] = validation_data["contact_count"].apply(lambda x: math.exp(x))
     #     predicted = np.exp(np.array(predicted))
     #     print(validation_data["contact_count"])
     #     print(predicted)
     # print(validation_data["chr"])
     chromosome = str(validation_data["chr"][1])
     binsize = str(get_bin_size(validation_data))
     # print("chromosome", chromosome)
     if "h" not in kwargs:
         kwargs["h"] = 2
     else:
         logging.info("for scc using h = " + str(kwargs["h"]))
     if "loop_file" not in kwargs:
         d = pd.concat([
             validation_data["contact_st"], validation_data["contact_en"],
             validation_data["contact_count"],
             pd.DataFrame(predicted)
         ],
                       axis=1)
     else:
         add_loop(validation_data, kwargs["loop_file"])
         d = pd.concat([
             validation_data["contact_st"], validation_data["contact_en"],
             validation_data["contact_count"],
             pd.DataFrame(predicted), validation_data["IsLoop"]
         ],
                       axis=1)
     in_fname = os.path.join(out_dir, "scc",
                             chromosome + "." + binsize + "." +
                             self.__represent_validation__() + ".scc")
     out_fname = in_fname + ".out"
     d.to_csv(in_fname, sep=" ", index=False)
     logging.info(datetime.datetime.now())
     if "p_file" not in kwargs or "e_file" not in kwargs:
         if "interact_pr_en" not in kwargs:
             out = subprocess.check_output([
                 "Rscript", kwargs["scc_file"], in_fname, out_fname,
                 str(kwargs["h"]), chromosome
             ])
         else:
             out = subprocess.check_output([
                 "Rscript", kwargs["scc_file"], in_fname, out_fname,
                 str(kwargs["h"]), chromosome, kwargs["interact_pr_en"]
             ])
     else:
         out = subprocess.check_output([
             "Rscript", kwargs["scc_file"], in_fname, out_fname,
             str(kwargs["h"]), chromosome, kwargs["p_file"],
             kwargs["e_file"]
         ])
     print(out.decode("utf-8", errors="replace"))
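
The branching at the end of this example only decides which optional arguments get appended to the Rscript call. A small sketch that flattens that decision into a helper function (the helper name and the file names are illustrative, not part of the original code):

def build_scc_cmd(scc_file, in_fname, out_fname, h, chromosome, **kwargs):
    # base command, mirroring the example above
    cmd = ["Rscript", scc_file, in_fname, out_fname, str(h), chromosome]
    if "p_file" in kwargs and "e_file" in kwargs:
        cmd += [kwargs["p_file"], kwargs["e_file"]]
    elif "interact_pr_en" in kwargs:
        cmd.append(kwargs["interact_pr_en"])
    return cmd

print(build_scc_cmd("scc.R", "toy.scc", "toy.scc.out", 2, "chr1"))
print(build_scc_cmd("scc.R", "toy.scc", "toy.scc.out", 2, "chr1",
                    p_file="p.txt", e_file="e.txt"))
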
Example #3
def MatPlot2HiC(matplot_obj, fname, out_folder):
    def Pandas2ChrSizes(
        chrsizes_filename, pandas_df
    ):  # takes every chromosome in the pandas object, finds its size and writes it to a file
        chromosomes = pandas_df.iloc[:, 0].unique()
        chrsizes_table = pd.DataFrame(columns=chromosomes)

        for i in range(len(chromosomes)):
            buf = pandas_df.loc[pandas_df['chr'] == chromosomes[i]][[
                'contact_st', 'contact_en'
            ]]
            max1 = buf.max().max()
            chrsizes_table.at[0, chromosomes[i]] = max1

            print('Completed: {}%'.format(i * 100 // len(chromosomes)),
                  end='\r')

        chr_list = list(chrsizes_table)

        with open(chrsizes_filename, 'w') as chrsizes_file:
            for j in range(len(chr_list)):
                chrsizes_file.write(chr_list[j] + '\t' +
                                    str(chrsizes_table.iloc[0][chr_list[j]]) +
                                    '\n')

    def Pandas2Pre(
        pre_filename, pandas_df
    ):  # builds a pre-HiC input file (for juicer "pre") from the pandas object, data or control
        pre_file = open(pre_filename, 'w')
        data_rows = pandas_df.shape[0]

        pandas_df.columns = ["chr1", "start", "end", "count"]
        pandas_df['str1'] = 0
        # sanity check: fewer than 10% of the non-zero counts should be vanishingly small
        assert len(pandas_df.loc[(pandas_df['count'] < 0.000001)
                                 & (pandas_df['count'] != 0)]) < (len(pandas_df['count']) / 10)
        pandas_df['exp'] = pandas_df['count'] * (1000000)
        pandas_df['exp'] = round(pandas_df['exp']).astype(int)

        pandas_df.to_csv(pre_file,
                         sep=" ",
                         columns=[
                             'str1', 'chr1', 'start', 'start', 'str1', 'chr1',
                             'end', 'end', 'exp'
                         ],
                         header=False,
                         index=False)

        pre_file.close()

    # make dirs
    base_dir = os.path.join(out_folder, fname)
    os.makedirs(os.path.join(base_dir, 'pre'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'hic'), exist_ok=True)

    # make filenames
    chromsizes_filename = os.path.join(base_dir, 'pre', 'chrom.sizes')
    pre_data_filename = os.path.join(base_dir, 'pre', 'pre_data.txt')
    hic_data_filename = os.path.join(base_dir, 'hic', 'data.hic')
    pre_control_filename = os.path.join(base_dir, 'pre', 'pre_control.txt')
    hic_control_filename = os.path.join(base_dir, 'hic', 'control.hic')

    # make chrom.sizes, pre-Hic for data and control
    print('Make chromosome sizes file...')
    time1 = time.time()
    Pandas2ChrSizes(chromsizes_filename, matplot_obj.data)
    time2 = time.time()
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' Chromosome sizes file created.\n')

    print('Make data pre-HiC file...')
    time1 = time.time()
    Pandas2Pre(pre_data_filename, matplot_obj.data)
    time2 = time.time()
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' DATA pre-HiC file created.\n')

    print('Make control pre-HiC file...')
    time1 = time.time()
    Pandas2Pre(pre_control_filename, matplot_obj.control)
    time2 = time.time()
    matplot_obj.columns = ["chr1", "start", "end", "count"]
    binsize = get_bin_size(matplot_obj.control, fields=["start", "start"])
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' CONTROL pre-HiC file created.\n')

    #call juicer
    subprocess.call([
        'java', '-jar', './juicer_tools.jar', 'pre', pre_data_filename,
        hic_data_filename, chromsizes_filename, '-n', '-r', str(binsize)
    ])
    print(colored("[SUCCESS]", 'green') + ' DATA HiC file created.\n')

    subprocess.call([
        'java', '-jar', './juicer_tools.jar', 'pre', pre_control_filename,
        hic_control_filename, chromsizes_filename, '-n', '-r', str(binsize)
    ])
    print(colored("[SUCCESS]", 'green') + ' CONTROL HiC file created.\n')
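
The chrom.sizes step in Pandas2ChrSizes can also be expressed with a groupby instead of a per-chromosome loop. A standalone sketch on a synthetic DataFrame with the same column names (the output path is a placeholder):

import pandas as pd

df = pd.DataFrame({"chr": ["chr1", "chr1", "chr2"],
                   "contact_st": [0, 500, 100],
                   "contact_en": [900, 1200, 800]})

# per-chromosome size = largest coordinate seen among its contact starts/ends
sizes = df.groupby("chr")[["contact_st", "contact_en"]].max().max(axis=1)

with open("chrom.sizes", "w") as f:
    for chrom, size in sizes.items():
        f.write(chrom + "\t" + str(int(size)) + "\n")
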
Example #4
    def __init__(self, filename):
        # def add_contact(series):
        #     i = int((series["contact_st"] - data_min) // binsize)
        #     j = int((series["contact_en"]- data_min) // binsize)
        #     predicted = series["contact_count"]
        #     real = series["0"]
        #     matrix[i,j] = predicted
        #     matrix[j,i] = real

        logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                            datefmt='%I:%M:%S',
                            level=logging.DEBUG)

        assert np.iinfo(np.dtype("uint32")).max > 250000000
        data = pd.read_csv(filename,
                           sep=" ",
                           dtype={
                               "contact_st": np.uint32,
                               "contact_en": np.uint32,
                               "contact_count": np.float32,
                               "0": np.float32,
                               "IsLoop": np.uint8
                           })
        logging.getLogger(__name__).debug("Reading data")
        data_min, data_max = data["contact_st"].min(), data["contact_en"].max()
        binsize = int(get_bin_size(data))
        logging.getLogger(__name__).debug("Using bin size " + str(binsize))
        assert data_min % binsize == data_max % binsize == 0
        assert data_max > data_min
        chr_len = int((data_max - data_min) // binsize)
        logging.getLogger(__name__).debug("Data size: " + str(chr_len) +
                                          " bins")
        matrix = np.zeros(shape=(chr_len + 1, chr_len + 1))

        # Fill matrix
        logging.getLogger(__name__).debug("Filling matrix")

        data["contact_st"] = ((data["contact_st"] - data_min) //
                              binsize).astype(int)
        data["contact_en"] = ((data["contact_en"] - data_min) //
                              binsize).astype(int)
        i = data["contact_st"].values
        j = data["contact_en"].values
        matrix[(i, j)] = data["contact_count"].values
        matrix[(j, i)] = data["0"].values

        #data.apply(add_contact,axis = "columns")

        diag_sums = [np.trace(matrix, i) for i in range(len(matrix))]

        # check that the first two diagonals are empty
        assert diag_sums[0] + diag_sums[1] == 0
        # check that we have distances up to 1.5 Mb
        assert np.trace(matrix, 1500000 // binsize - 1) != 0

        #TODO apply transformations to data here
        logging.getLogger(__name__).debug("Normalizing data")

        mean = -100  # find the mean along the first non-zero diagonal
        for ind, val in enumerate(diag_sums):
            if val != 0:
                mean = val / (len(matrix) - ind)
                break

        if mean - 1 <= 1:  # For oe values mean is ~1
            logging.getLogger(__name__).info(
                "Assuming contacts, going to convert to o/e values.")

            for i in range(len(matrix)):
                if diag_sums[i] == 0:
                    continue
                else:
                    for j in range(i, len(matrix)):
                        matrix[j - i, j] = matrix[j - i, j] / diag_sums[i]
                        matrix[j, j - i] = matrix[j, j - i] / diag_sums[i]

        self.split2pairs(matrix,
                         length=1500000 // binsize,
                         step=750000 // binsize)
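
The per-diagonal normalization at the end of this example can be written without the nested Python loops. A vectorized sketch of the same idea on a toy matrix, here dividing each diagonal by its mean (the usual observed/expected convention) rather than by its sum as the example does:

import numpy as np

matrix = np.array([[0., 4., 2.],
                   [4., 0., 6.],
                   [2., 6., 0.]])

n = len(matrix)
oe = matrix.copy()
for offset in range(n):
    upper = np.diagonal(matrix, offset)
    expected = upper.mean()
    if expected == 0:
        continue
    idx = np.arange(n - offset)
    # rescale the offset-th diagonal above and below the main diagonal
    oe[idx, idx + offset] = upper / expected
    oe[idx + offset, idx] = np.diagonal(matrix, -offset) / expected

print(oe)
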
Example #5
                   })

print(data.head())
assert np.all((data["end"] - data["st"]).values >= 0)

#print (data.max(),data.min())
null_values_count = data.isnull().values.sum()
if null_values_count != 0:
    print("Found null values")
    print(null_values_count, " out of ", len(data))
    data = data.dropna()

assert not data.isnull().values.any()

binsize = get_bin_size(data, fields=["st", "end"])

data["end"] = data["end"] // binsize
data["st"] = data["st"] // binsize

grouped = data.groupby(by="chr")
boundaries = []

length = 1500000 // binsize
step = length // 5
# end = min(start+length+step*10,len(array) - length)

for chrom, grouped_data in grouped:
    start = 0
    print("Computing chr ", chrom)
    result = get_compartments(grouped_data, binsize, start, length, step)
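
For reference, the sliding-window geometry used above (a 1.5 Mb window measured in bins, stepped by a fifth of its length) tiles a chromosome like this. A standalone sketch with assumed toy numbers, independent of the project-specific get_compartments:

binsize = 25000                  # assumed toy bin size
length = 1500000 // binsize      # window length in bins (60 here)
step = length // 5               # step between window starts (12 bins)
chrom_len_bins = 200             # toy chromosome length in bins

window_starts = range(0, chrom_len_bins - length + 1, step)
windows = [(s, s + length) for s in window_starts]
print(windows[:3])               # [(0, 60), (12, 72), (24, 84)]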