Example #1
def simulate(n_simul, agents, grid_size, candy_ratio=1., max_iter=500):
    print("Simulations")
    wins = dict((id, 0.) for id in range(len(agents)))
    points = dict((id, []) for id in range(len(agents)))
    scores = dict((id, []) for id in range(len(agents)))

    iterations = []
    for it in range(n_simul):
        progressBar(it, n_simul)
        endState = controller(agents,
                              grid_size,
                              candy_ratio=candy_ratio,
                              max_iter=max_iter,
                              verbose=0)
        if len(endState.snakes) == 1:
            wins[list(endState.snakes.keys())[0]] += 1. / n_simul
            points[list(endState.snakes.keys())[0]].append(
                list(endState.snakes.values())[0].points)

        for id in range(len(agents)):
            scores[id].append(endState.scores[id])

        iterations.append(endState.iter)
    progressBar(n_simul, n_simul)
    points = dict((id, sum(val) / len(val)) for id, val in points.items())
    return wins, points, scores, iterations
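The snippet above reports progress through a two-argument progressBar(iteration, total) helper that is not defined in this listing. A minimal console implementation compatible with those calls could look like the following sketch (an assumed stand-in, not the original utility):

import sys

def progressBar(iteration, total, bar_length=40):
    # Draw a simple in-place console progress bar, e.g. [########--------]  50.0%
    fraction = iteration / float(total) if total else 1.0
    filled = int(bar_length * fraction)
    bar = "#" * filled + "-" * (bar_length - filled)
    sys.stdout.write("\r[{}] {:5.1f}%".format(bar, 100 * fraction))
    if iteration >= total:
        sys.stdout.write("\n")
    sys.stdout.flush()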
Example #2
    def train_loop(self, dataset, epochs, total_steps):
        for epoch in range(epochs):
            for i, real_images in enumerate(dataset):

                noises = tf.random.normal(shape=(self.batch_size, self.z_dim))
                dis_loss = self.discriminator_train_step(noises, real_images)
                gen_loss = self.generator_train_step(noises)

                progressBar(epoch + 1, i + 1, total_steps, dis_loss.numpy(),
                            gen_loss.numpy())

                if (i + 1) % 50 == 0:
                    self.generate_training_progress_result(
                        self.ckpt.generator, epoch + 1, i + 1)

            dis_loss_epoch = self.dis_metric.result().numpy()
            gen_loss_epoch = self.gen_metric.result().numpy()

            print()
            print(
                f"Epoch {epoch+1} - D-Loss: {dis_loss_epoch} - G-Loss: {gen_loss_epoch}"
            )

            self.ckpt_manager.save()
            self.ckpt.epoch.assign_add(1)
            self.dis_metric.reset_states()
            self.gen_metric.reset_states()
Example #3
    def train(self,
              strategies,
              grid_size,
              num_trials=100,
              max_iter=1000,
              verbose=False):
        print "RL training"
        totalRewards = []  # The rewards we get on each trial
        rl_id = len(strategies)
        for trial in xrange(num_trials):
            progressBar(trial, num_trials)
            game = interface.Game(grid_size,
                                  len(strategies) + 1,
                                  candy_ratio=1.,
                                  max_iter=max_iter)
            state = game.startState()
            totalDiscount = 1
            totalReward = 0
            points = state.snakes[rl_id].points
            history = []
            while not game.isEnd(state) and rl_id in state.snakes:
                # Compute the actions for each player following its strategy
                actions = {
                    i: strategies[i](i, state)
                    for i in state.snakes.keys() if i != rl_id
                }
                action, optimal_action = self.getAction(state)
                actions[rl_id] = action

                newState = game.succ(state, actions)
                if rl_id in newState.snakes:
                    reward = newState.snakes[rl_id].points - points
                    if len(newState.snakes) == 1:  # it won
                        reward += 10.
                    points = newState.snakes[rl_id].points
                    self.incorporateFeedback(state, action, reward, newState,
                                             history)
                else:  # it died
                    reward = -10.
                    self.incorporateFeedback(state, action, reward, newState,
                                             history)

                # add decision to history, or reset if non-greedy choice
                if optimal_action:
                    history.append(self.featureExtractor(state, action))
                else:
                    history = []

                totalReward += totalDiscount * reward
                totalDiscount *= self.discount
                state = newState

            if verbose:
                print "Trial %d (totalReward = %s)" % (trial, totalReward)
            totalRewards.append(totalReward)

        progressBar(num_trials, num_trials)
        print "Average reward:", sum(totalRewards) / num_trials
        return totalRewards
Example #4
    def train(self,
              opponents,
              grid_size,
              num_trials=100,
              max_iter=1000,
              verbose=False):
        print("RL training")
        totalRewards = []  # The rewards we get on each trial
        # rl_id = len(opponents)
        rl_agent = self.getAgent()

        agents = deepcopy(opponents)  # add current agent to strategies
        agents.append(rl_agent)

        for trial in range(num_trials):
            # game = interface.Game(grid_size, len(strategies) + 1, candy_ratio = 1., max_iter = max_iter)
            # state = game.startState()
            game = interface.Game(grid_size,
                                  len(agents),
                                  candy_ratio=1.,
                                  max_iter=max_iter)
            game.start(agents)
            totalDiscount = 1
            totalReward = 0
            rewards = []

            while not game.isEnd() and rl_agent.isAlive(game):
                # Compute the actions for each player following its strategy
                # actions = {i: strategies[i](i, state) for i in state.snakes.keys() if i != rl_id}
                # action = self.getAction(state)
                # actions[rl_id] = action
                actions = game.agentActions()
                newState = game.succ(game.current_state, actions)

                reward = rl_agent.lastReward(game)
                rewards.append(reward)

                totalReward += totalDiscount * reward
                totalDiscount *= self.discount

                if self.rl_type == "qlearning":
                    self.incorporateFeedback(game.previous_state,
                                             actions[rl_agent.getPlayerId()],
                                             reward, game.current_state)

            if self.rl_type == "policy_gradients":
                self.addRolloutFeedback(rewards, trial)

            progressBar(trial,
                        num_trials,
                        info="Last reward: {}".format(totalReward))
            if verbose:
                print("Trial %d (totalReward = %s)" % (trial, totalReward))
            totalRewards.append(totalReward)

        progressBar(num_trials, num_trials)
        print("Average reward:", sum(totalRewards) / num_trials)
        return totalRewards
Example #5
    def fit(self, x, y, x_val, y_val, steps=10000, save_best_only=True):
        sess, saver = self.sess, self.saver
        train, cost = self.train, self.cost
        H_indices, W_indices, Y = self.H_indices, self.W_indices, self.Y
        epochs, cost_function, min_cost = self.epochs, self.cost_function, self.min_cost
        checkpoint, model_name = self.checkpoint, self.model_name

        for epoch in range(0, epochs + 1):
            for step in range(0, steps + 1):
                training_error = sess.run(cost,
                                          feed_dict={
                                              H_indices: x[0],
                                              W_indices: x[1],
                                              Y: y
                                          })
                validation_error = sess.run(cost,
                                            feed_dict={
                                                H_indices: x_val[0],
                                                W_indices: x_val[1],
                                                Y: y_val
                                            })

                progressBar(
                    step, steps,
                    "Epoch : %d / %d, %s(training) : %.4f, %s(test) : %.4f" %
                    (epoch, epochs, cost_function, training_error,
                     cost_function, validation_error))

                sess.run(train,
                         feed_dict={
                             H_indices: x[0],
                             W_indices: x[1],
                             Y: y
                         })

            if not save_best_only or (save_best_only
                                      and min_cost > validation_error):
                filename = model_name + "%03d_%.4f" % (
                    epoch, validation_error) + ".ckpt"
                model_path = os.path.join(checkpoint, filename)
                saver.save(sess, model_path)
                print("Saved model %s" % filename)

        print("Done")
Example #6
def main(NUMBERS, TARGET):

    n = len(NUMBERS)

    progress = 0
    solutionsGiven = 0
    SOL_SPACE_SIZE = n * (factorial(n - 1)**2) * (2**(n - 1))  # See attached PDF

    countdown.spinner(NUMBERS, TARGET)

    INDENT = "\t\t\t"

    with open("solutions.txt", "w") as file:
        solution_generator = solver(NUMBERS, TARGET)

        for solution in solution_generator:
            progress += 1

            if solution != "":

                if solutionsGiven <= 100:
                    file.write(solution + "\n")
                    solutionsGiven += 1
                progressFractional = progress / SOL_SPACE_SIZE

                countdown.spinner(NUMBERS, TARGET)
                progressBar(INDENT, progressFractional, solution)
        #   Uncomment line below to see number of operations that ran.
        #   This number is exactly equal to the solution
        #   space calculated!
        # file.write(str(progress))

    with open("solutions.txt", "r") as file:

        solutions = file.readlines()

        if solutions == []:
            print("This problem is impossible!")
            return

        for line in solutions[:100]:
            print(line)
Example #7
    def train_loop(self, dataset, epochs, total_steps):
        for epoch in range(epochs):

            start = time.time()
            for i, train_images in enumerate(dataset):

                loss = self.train_step(train_images)

                progressBar(epoch + 1, i + 1, total_steps, loss.numpy())

                if (i + 1) % 50 == 0:
                    self.generate_training_progress_result(
                        self.ckpt.decoder, epoch + 1, i + 1)
            stop = time.time()

            print()
            print(
                f"EPOCH: {epoch+1} - LOSS: {self.vae_metric.result().numpy()} - Time: {round(stop-start,2)}s"
            )

            self.ckpt_manager.save()
            self.ckpt.epoch.assign_add(1)
            self.vae_metric.reset_states()
Example #8
    def updateTrading212Dailies(self):
        stock_list = pd.read_csv("Trading212US.csv")
        for symbol in progressBar(stock_list['Symbol'], "Updating Dailies: ", "Complete", length=50, decimals=1):
            while True:
                try:
                    data = self.stock_candles(symbol, 'D', time_to_unix(datetime(2019, 8,1,5,0,0)), time_to_unix((datetime.today()-timedelta(1)).replace(hour=23)))
                    break
                except finnhub.exceptions.FinnhubAPIException:
                    time.sleep(10)

            if data['s'] == 'no_data':
                continue
            
            data = self.candle_prep(data)
            data.to_csv("hist_data/{}_Daily.csv".format(symbol))
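This example (and Example #9 below) uses progressBar as a generator that wraps an iterable and accepts prefix, suffix, length and decimals arguments. A minimal sketch with that interface (an assumed implementation, not the original) is:

import sys

def progressBar(iterable, prefix="", suffix="", decimals=1, length=50, fill="#"):
    # Yield items from the iterable while redrawing an in-place console bar after each item.
    total = len(iterable)

    def draw(i):
        percent = ("{0:." + str(decimals) + "f}").format(100 * i / float(total)) if total else "100.0"
        filled = length * i // total if total else length
        bar = fill * filled + "-" * (length - filled)
        sys.stdout.write("\r%s|%s| %s%% %s" % (prefix, bar, percent, suffix))
        sys.stdout.flush()

    draw(0)
    for i, item in enumerate(iterable, 1):
        yield item
        draw(i)
    sys.stdout.write("\n")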
Example #9
    def get_intraday_minutes(self, watchlist, day=None, flag='today'):
        no_data = []
        watch_dict = {}
        end_close = {}

        day = day if day else datetime.today()
        if flag == 'yesterday':
            delta = 3 if day.weekday() == 0 else 1
            day = day - timedelta(delta)

        save_path = Path("minute_data/" + day.date().isoformat())
        save_path.mkdir(parents=True, exist_ok=True)        
        time_start = time_to_unix(day.replace(hour=15,minute=30, second=0))
        time_end = time_to_unix(day.replace(hour=22,minute=0, second=30))

        for symbol in progressBar(watchlist, "Fetching intraday data of {}: ".format(flag), length=50, decimals=1):
            symbol_csv = save_path.joinpath("{}.csv".format(symbol))
            if symbol_csv.is_file():
                watch_dict[symbol] = pd.read_csv(symbol_csv, index_col='Time', parse_dates=True)
                end_close[symbol] = watch_dict[symbol]['Price'].iloc[-1]
                continue

            while True:
                try:
                    data = self.stock_candles(symbol, 1, time_start, time_end)
                    break
                except (finnhub.exceptions.FinnhubAPIException, requests.exceptions.RequestException) as e:
                    print(e)
                    time.sleep(5)

            if data['s'] == 'no_data':
                no_data.append(symbol)
                continue
            
            data = self.minutes_prep(data)
            watch_dict[symbol] = data
            end_close[symbol] = data['Price'].iloc[-1]
            data.to_csv(symbol_csv)

        if len(no_data) > 0:
            print("no intraday data found for these symbols: \n", no_data)   
        return watch_dict, no_data, end_close
                   sep=";")  # Note that ID.txt file is delimited by semicolon
psms = psms[["Peptide", "Outfile", "measuredMH", "XCorr"]]
psms = psms.loc[psms["Outfile"].str.contains(
    mzXMLBaseName)]  # Extract PSMs from FTLD_Batch2_F50.mzXML
psms["precMz"] = np.nan
psms["charge"] = np.nan
psms["featureIndex"] = np.nan
psms["category"] = ""
psms = psms.drop_duplicates()
print("  PSM information has been parsed\n")

# Find the match between features and PSMs
n1, n2 = 0, 0  # n1 = number of PSMs mapped to feature(s), n2 = number of PSMs not mapped to any feature
isoWindow = 1  # Isolation window size for a precursor peak
proton = 1.007276466812
progress = utils.progressBar(psms.shape[0])
for idx, psm in psms.iterrows():
    progress.increment()
    [psmRunName, psmScanNum, _, psmZ,
     _] = os.path.basename(psm["Outfile"]).split(".")
    psms.loc[idx, "charge"] = psmZ
    if psmRunName == mzXMLBaseName:
        # Extract the precursor m/z
        psmScanNum = int(psmScanNum)
        surveyScanNum = ms2ToSurvey[psmScanNum]
        precMz, _, _ = getPrecursorPeak(reader, psmScanNum, surveyScanNum,
                                        params)

        # Assign the feature corresponding to the PSM, based on the precursor m/z and feature's m/z
        if precMz != -1:
            psms.loc[idx, "precMz"] = precMz
Example #11
accumulated_mae = 0
accumulated_mse = 0

print(test_size_ML, " points to process...")
horizon = 10
for i in range(0, test_size_ML - horizon - 1, test_size_ML // 100):

    x_train = test_data_ML.x[i].reshape(-1, window_size * dim)
    y_train = test_data_ML.y[i]

    temp_kernel = ConstantKernel(
        1.0, (1e-3, 1e3)) * RationalQuadratic() + WhiteKernel()
    gpr = GaussianProcessRegressor(kernel=temp_kernel, random_state=0)
    gpr.fit(x_train, y_train)

    y_pred = gpr.predict(test_data_ML.x[i + 1:i + horizon].reshape(
        -1, window_size * dim))

    temp_mae = mae(y_pred, test_data_ML.y[i + 1:i + horizon].reshape(-1, 1))
    temp_mse = mse(y_pred, test_data_ML.y[i + 1:i + horizon].reshape(-1, 1))

    mae_list.append(temp_mae)
    mse_list.append(temp_mse)

    accumulated_mae = (i / (i + 1)) * accumulated_mae + temp_mae / (i + 1)
    accumulated_mse = (i / (i + 1)) * accumulated_mse + temp_mse / (i + 1)

    progressBar(i, test_size_ML, 100)

print("Accumulated_mae ", accumulated_mae)
print("Accumulated_mse ", accumulated_mse)
Example #12
def detectFeatures(inputFile, paramFile):
    ##############
    # Parameters #
    ##############
    params = utils.getParams(paramFile)
    firstScan = int(params["first_scan_extraction"])
    lastScan = int(params["last_scan_extraction"])
    gap = int(params["skipping_scans"])
    scanWindow = gap + 1
    matchPpm = float(params["mass_tolerance_peak_matching"])

    ##################
    # Initialization #
    ##################
    reader = mzxml.read(inputFile)
    f = []  # Feature array
    nFeatures = -1
    cache = []
    noise = {}  # Empty dictionary for noise level information
    oldMinInd = -1
    oldMaxInd = -1

    ############################
    # Get MS1 scan information #
    ############################
    ms = []
    with reader:
        msCount = 0
        # filename = os.path.basename(inputFile)
        # print("  Extraction of MS1 spectra from %s" % filename)
        for spec in reader:
            msLevel = int(spec["msLevel"])
            scanNum = int(spec["num"])
            if msLevel == 1 and firstScan <= scanNum <= lastScan:
                ms.append(spec)
                msCount += 1
            elif scanNum > lastScan:
                break
        # print("  Done")

    ################################
    # Feature (3D-peak) generation #
    ################################
    filename = os.path.basename(inputFile)
    print("  Feature detection from %s" % filename)
    logging.info("  Feature detection from " + filename)
    progress = utils.progressBar(msCount)
    for i in range(msCount):
        progress.increment()
        minInd = max(0, i - gap - 1)
        maxInd = min(msCount - 1, i + gap + 1)
        if i == 0:
            for j in range(maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)
        else:
            for j in range(oldMinInd, minInd):
                cache.pop(0)  # Remove the first element in cache
            for j in range(oldMaxInd + 1, maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)

        ##################
        # Reduction step #
        ##################
        p = cache[i - minInd]
        pCount = len(p["m/z array"])
        valids = np.array([])
        count = 0
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    match, ind = getClosest(q, cm, matchPpm)
                if match == 1:
                    break
                nTry += 1
                if nTry > scanWindow:
                    break
            if match == 0:  # Forward search
                nTry = 0
                for k in range(i + 1, maxInd + 1):
                    q = cache[k - minInd]
                    if q["m/z array"].size == 0:
                        continue
                    else:
                        match, ind = getClosest(q, cm, matchPpm)
                    if match == 1:
                        break
                    nTry += 1
                    if nTry > scanWindow:
                        break
            if match == 1:
                valids = np.append(valids, j)

        # Peak reduction and noise-level estimation
        p, noise = reduceMS1(p, noise, valids)

        #####################
        # Peak merging step #
        #####################
        cache[i - minInd] = p
        pCount = len(p["m/z array"])
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            matchedPeakInd = []
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    matchIndicator, ind = getClosest(q, cm, matchPpm)
                    # $matchIndicator = 1 means that the j-th (reduced) peak in the i-th scan
                    # can form a 3D-peak with $ind-th (reduced) peak in the previous scan (%q)
                    if matchIndicator == 1:
                        matchedPeakInd.append(q["featureIndex"][ind])
                        match = 1
            if match == 1:
                matchedPeakInd = list(set(matchedPeakInd))  # Make the list unique
                fInd = None
                if len(matchedPeakInd) > 1:  # There are multiple matches to the peaks in previous scans
                    fInd = min(matchedPeakInd)
                    for m in matchedPeakInd:
                        # Merge to the lowest indexed feature and remove the "merged" features
                        if m != fInd:
                            f[fInd]["mz"].extend(f[m]["mz"])
                            f[fInd]["intensity"].extend(f[m]["intensity"])
                            f[fInd]["num"].extend(f[m]["num"])
                            f[fInd]["rt"].extend(f[m]["rt"])
                            f[fInd]["index"].extend(f[m]["index"])

                            # Revise cache array
                            for s in f[m]["index"]:
                                for t in range(len(cache)):
                                    if cache[t]["index"] == s:
                                        for u in range(len(cache[t]["featureIndex"])):
                                            if cache[t]["featureIndex"][u] == m:
                                                cache[t]["featureIndex"][u] = fInd
                            f[m] = None  # Keep the size of feature array
                else:
                    fInd = matchedPeakInd[0]
                if "featureIndex" in cache[i - minInd]:
                    cache[i - minInd]["featureIndex"].append(fInd)
                else:
                    cache[i - minInd]["featureIndex"] = [fInd]
                f[fInd]["mz"].append(p["m/z array"][j])
                f[fInd]["intensity"].append(p["intensity array"][j])
                f[fInd]["num"].append(p["num"])
                f[fInd]["rt"].append(p["retentionTime"])
                f[fInd]["index"].append(p["index"])

            if match != 1:
                if i < msCount:
                    nFeatures += 1
                    if "featureIndex" in cache[i - minInd]:
                        cache[i - minInd]["featureIndex"].append(nFeatures)
                    else:
                        cache[i - minInd]["featureIndex"] = [nFeatures]
                    f.append({"mz": [p["m/z array"][j]],
                              "intensity": [p["intensity array"][j]],
                              "num": [p["num"]],
                              "rt": [p["retentionTime"]],
                              "index": [i]})

        oldMinInd = minInd
        oldMaxInd = maxInd

    # Remove empty features
    f = [i for i in f if i is not None]

    #################################
    # Filtering features (3D-peaks) #
    #################################
    # A feature may contain multiple peaks from one scan
    # In this case, one with the largest intensity is chosen
    gMinRt, gMaxRt = 0, 0  # Global minimum and maximum RT over all features
    for i in range(len(f)):
        if len(f[i]["num"]) != len(list(set(f[i]["num"]))):
            temp = {}
            for j in range(len(f[i]["num"])):
                if f[i]["num"][j] in temp:
                    currIntensity = f[i]["intensity"][j]
                    if currIntensity > temp[f[i]["num"][j]]["intensity"]:
                        temp[f[i]["num"][j]]["intensity"] = currIntensity
                        temp[f[i]["num"][j]]["index"] = j
                else:
                    temp[f[i]["num"][j]] = {}
                    temp[f[i]["num"][j]]["intensity"] = f[i]["intensity"][j]
                    temp[f[i]["num"][j]]["index"] = j
            uInd = []
            for key in sorted(temp.keys()):
                uInd.append(temp[key]["index"])
            f[i]["mz"] = [f[i]["mz"][u] for u in uInd]
            f[i]["intensity"] = [f[i]["intensity"][u] for u in uInd]
            f[i]["num"] = [f[i]["num"][u] for u in uInd]
            f[i]["rt"] = [f[i]["rt"][u] for u in uInd]
            f[i]["index"] = [f[i]["index"][u] for u in uInd]

        if i == 0:
            gMinRt = min(f[i]["rt"])
            gMaxRt = max(f[i]["rt"])
        else:
            if min(f[i]["rt"]) < gMinRt:
                gMinRt = min(f[i]["rt"])
            if max(f[i]["rt"]) > gMaxRt:
                gMaxRt = max(f[i]["rt"])

    if gMaxRt.unit_info == "minute":
        gMaxRt = gMaxRt * 60
        gMinRt = gMinRt * 60

    ###################################
    # Organization of output features #
    ###################################
    n = 0
    ms1ToFeatures = {}
    for i in range(len(f)):
        # 1. mz: mean m/z of a feature = intensity-weighted average of its m/z values
        mz = np.sum(np.multiply(f[i]["mz"], f[i]["intensity"])) / np.sum(f[i]["intensity"])

        # 2. intensity: intensity of a feature (maximum intensity among the peaks that constitute the feature)
        intensity = max(f[i]["intensity"])

        # 3. z: charge of the feature, set to 1 now, but modified later
        z = 1
        isotope = 0  # Will be used later

        # 4. RT: RT of the representative peak (i.e. strongest peak) of a feature
        ind = np.argmax(f[i]["intensity"])
        rt = f[i]["rt"][ind]

        # 5. minRT and maxRT
        minRt = min(f[i]["rt"])
        maxRt = max(f[i]["rt"])

        # Conversion of RT to the unit of second
        if rt.unit_info == "minute":
            rt = rt * 60  # Convert to the unit of second
            minRt = minRt * 60
            maxRt = maxRt * 60

        # 6. MS1 scan number of the representative peak of a feature
        ms1 = f[i]["num"][ind]

        # 7. minMS1 and maxMS1
        minMs1 = min(list(map(int, f[i]["num"])))
        maxMs1 = max(list(map(int, f[i]["num"])))

        # 8. SNratio (signal-to-noise ratio of the feature)
        if ms1 in noise:
            noiseLevel = noise[ms1]
        else:
            noiseLevel = 500
        snRatio = intensity / noiseLevel
        featureIntensityThreshold = noiseLevel * float(params["signal_noise_ratio"])

        if intensity >= featureIntensityThreshold:
            # 9. Percentage of true feature
            pctTF = (maxRt - minRt) / (gMaxRt - gMinRt) * 100
            # Organize features in a structured numpy array form
            if n == 0:
                features = np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1, maxMs1, snRatio, pctTF, isotope)],
                                    dtype="f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8")
                n += 1
            else:
                features = np.append(features,
                                     np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1, maxMs1, snRatio, pctTF, isotope)],
                                              dtype=features.dtype))
            for j in range(len(f[i]["num"])):
                num = f[i]["num"][j]
                if num not in ms1ToFeatures:
                    ms1ToFeatures[num] = {"mz": [f[i]["mz"][j]],
                                          "intensity": [f[i]["intensity"][j]]}
                else:
                    ms1ToFeatures[num]["mz"].append(f[i]["mz"][j])
                    ms1ToFeatures[num]["intensity"].append(f[i]["intensity"][j])
        else:
            continue

    features.dtype.names = ("mz", "intensity", "z", "RT", "minRT", "maxRT", "MS1", "minMS1", "maxMS1", "SNratio", "PercentageTF", "isotope")

    ##########################
    # Decharging of features #
    ##########################
    features = dechargeFeatures(features)
    # print()

    ############################################
    # Convert the features to pandas dataframe #
    # Write features to a file                 #
    ############################################
    df = pd.DataFrame(features)
    df = df.drop(columns = ["isotope"])    # "isotope" column was internally used, and no need to be transferred

    # Create a subdirectory and save features to a file
    baseFilename = os.path.splitext(os.path.basename(filename))[0]  # i.e. filename without extension
    featureDirectory = os.path.join(os.getcwd(), baseFilename)
    if not os.path.exists(featureDirectory):
        os.mkdir(featureDirectory)

    # # Increment the number of a feature file
    # if len(glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature"))) == 0:
    #     featureFilename = os.path.splitext(os.path.basename(filename))[0] + ".1.feature"
    # else:
    #     oldNo = 0
    #     for f in glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature")):
    #         oldNo = max(oldNo, int(os.path.basename(f).split(".")[-2]))
    #     featureFilename = baseFilename + "." + str(int(oldNo) + 1) + ".feature"
    # featureFilename = os.path.join(featureDirectory, featureFilename)

    # Simply overwrite any existing feature file
    # Individual feature file still needs to be located in an input file-specific location
    # since the feature file can be directly used later
    featureFilename = baseFilename + ".feature"
    featureFilename = os.path.join(featureDirectory, featureFilename)
    df.to_csv(featureFilename, index = False, sep = "\t")

    return df  # Pandas DataFrame
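This example (and several later ones) constructs utils.progressBar(total) and then calls increment() once per processed item. A minimal counter-style stand-in with that interface might be (assumed sketch, not the original utils module):

import sys

class progressBar:
    # Counter-style progress reporter: construct with the expected total, then
    # call increment() once per item to redraw an in-place "n / total" line.
    def __init__(self, total):
        self.total = max(int(total), 1)
        self.count = 0

    def increment(self):
        self.count += 1
        pct = 100.0 * self.count / self.total
        sys.stdout.write("\r  %d / %d (%.1f%%)" % (self.count, self.total, pct))
        if self.count >= self.total:
            sys.stdout.write("\n")
        sys.stdout.flush()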
Example #13
def searchLibrary(full, paramFile):
    ##################################
    # Load parameters and initialize #
    ##################################
    try:
        params = utils.getParams(paramFile)
    except Exception:
        sys.exit("Parameter file cannot be found or cannot be loaded")
    condition = params["LC_column"].lower()
    if params["mode"] == "1":
        condition = condition + "p"
    elif params["mode"] == "-1":
        condition = condition + "n"
    else:
        sys.exit("'mode' parameter should be either 1 or -1")
    proton = 1.007276466812
    matchMzTol = float(params["library_mass_tolerance"])  # Unit of ppm
    adducts = adductDictionary(params)
    nFeatures = full.shape[0]
    # While full["feature_RT"] has the unit of minute, the library compounds have RTs in the unit of second
    # So, within this function, full["feature_RT"] needs to be converted to the unit of second
    full["feature_RT"] = full["feature_RT"] * 60

    ##########################
    # Perform library search #
    ##########################
    allRes = pd.DataFrame()
    nLibs = 1
    for libFile in params["library"]:
        doAlignment = int(params["library_rt_alignment"])
        print("  Library {} is being loaded".format(os.path.basename(libFile)))
        logging.info("  Library {} is being loaded".format(
            os.path.basename(libFile)))
        try:
            conn = sqlite3.connect(libFile)
        except Exception:
            sys.exit("Library file cannot be found or cannot be loaded.")

        #####################################################
        # RT-alignment between features and library entries #
        #####################################################
        # Check whether 'rt' column of the library is numeric value or not
        hasNumericRt = 0
        cursor = conn.execute("PRAGMA table_info(library)")
        pragma = cursor.fetchall()
        for row in pragma:
            if row[1].lower() == "rt":
                if row[2].lower() == "real":
                    hasNumericRt = 1
                break

        # RT-alignment
        if doAlignment == 1:
            if hasNumericRt == 1:
                print(
                    "  RT-alignment is being performed between features and library compounds"
                )
                logging.info(
                    "  RT-alignment is being performed between features and library compounds"
                )
                x, y = prepRtAlignment(full, conn, params)
                mod = rtAlignment(x, y)
                if mod == -1:
                    print(
                        "  Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped"
                    )
                    logging.info(
                        "  Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped"
                    )
                    doAlignment = 0
                else:
                    # Calibration of features' RT
                    rPredict = ro.r("predict")
                    full["feature_calibrated_RT"] = None
                    full["feature_calibrated_RT"] = full[
                        "feature_RT"] - rPredict(
                            mod, FloatVector(full["feature_RT"]))
                    # Empirical CDF of alignment (absolute) residuals (will be used to calculate RT shift-based scores)
                    ecdfRt = ECDF(abs(np.array(mod.rx2("residuals"))))
            else:
                print(
                    "  Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library"
                )
                print("  Therefore, RT-alignment is not performed")
                logging.info(
                    "  Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library"
                )
                logging.info("  Therefore, RT-alignment is not performed")
                doAlignment = 0
        else:
            print(
                "  According to the parameter, RT-alignment is not performed between features and library compounds"
            )
            logging.info(
                "  According to the parameter, RT-alignment is not performed between features and library compounds"
            )

        ########################################
        # Match features and library compounds #
        ########################################
        # Match features and library compounds
        print("  Features are being compared with library compounds")
        logging.info("  Features are being compared with library compounds")
        res = {
            "no": [],
            "feature_index": [],
            "feature_m/z": [],
            "feature_original_RT": [],
            "feature_aligned_RT": [],
            "id": [],
            "other_id": [],
            "formula": [],
            "name": [],
            "ion": [],
            "RT": [],
            "SMILES": [],
            "InchiKey": [],
            "collision_energy": [],
            "RT_shift": [],
            "RT_score": [],
            "MS2_score": [],
            "combined_score": []
        }
        intensityCols = [
            col for col in full.columns if col.lower().endswith("_intensity")
        ]
        for c in intensityCols:
            res[c] = []
        n = 0
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            # Feature information
            fZ = full["feature_z"].iloc[i]
            fSpec = full["MS2"].iloc[i]
            # Skip the feature if its charge or MS2 spectrum is not defined
            if np.isnan(fZ) or fSpec is None:
                continue
            fMz = full["feature_m/z"].iloc[i]
            fRt = full["feature_RT"].iloc[i]
            fIntensity = full[intensityCols].iloc[i]
            if params["mode"] == "1":  # Positive mode
                fMass = fZ * (fMz - proton)
            elif params["mode"] == "-1":  # Negative mode
                fMass = fZ * (fMz + proton)

            # Retrieve library compounds of which neutral masses are similar to feature mass
            df = queryLibrary(fMz, fMass, fZ, conn, adducts, matchMzTol)
            if not df.empty:
                colNameOtherId = df.filter(regex="other_ids").columns[0]
                for j in range(df.shape[0]):
                    # When there is/are library compound(s) matched to the feature,
                    # MS2 of the library compound(s) should be retrieved
                    uid = df["id"].iloc[j]
                    uid = uid.replace("##Decoy_", "")
                    sqlQuery = r"SELECT * FROM {}".format(uid)
                    try:
                        libSpec = pd.read_sql_query(sqlQuery, conn)
                    except Exception:
                        continue
                    if not libSpec.empty:
                        n += 1
                        # Calculate the score based on MS2 spectrum
                        libSpec = libSpec.to_dict(orient="list")
                        simMs2 = calcMS2Similarity(fSpec, libSpec, params)
                        pMs2 = 1 - simMs2  # p-value-like score (the smaller, the better)
                        pMs2 = max(np.finfo(float).eps,
                                   pMs2)  # Prevent the underflow caused by 0

                        # Calculate the (similarity?) score based on RT-shift
                        if doAlignment == 1:
                            fAlignedRt = full["feature_calibrated_RT"].iloc[i]
                            rtShift = fAlignedRt - df["rt"].iloc[j]
                            pRt = ecdfRt(
                                abs(rtShift)
                            )  # Also, p-value-like score (the smaller, the better)
                            pRt = max(np.finfo(float).eps, pRt)
                            simRt = 1 - pRt
                            # p = 1 / (0.5 / pMS2 + 0.5 / pRt)  # Combined p-value using harmonic mean with equal weights
                            p = 1 - stats.chi2.cdf(
                                -2 * (np.log(pMs2) + np.log(pRt)),
                                4)  # Fisher's method
                            # p = -2 * (np.log(pMs2) + np.log(pRt))   # Fisher's method used in Perl pipeline (the smaller, the better)
                        else:
                            fAlignedRt = "NA"
                            if hasNumericRt == 1 and df["rt"].iloc[
                                    j] is not None:
                                rtShift = fRt - df["rt"].iloc[j]
                            else:
                                rtShift = "NA"
                            # pRt = 1
                            simRt = "NA"
                            p = pMs2

                        # Output
                        libId = df["id"].iloc[j]
                        libOtherId = df[colNameOtherId].iloc[j]
                        libFormula = df["formula"].iloc[j]
                        libName = df["name"].iloc[j]
                        if hasNumericRt == 1:
                            libRt = df["rt"].iloc[j]
                        else:
                            libRt = "NA"
                        libIon = df["ion_type"].iloc[j]
                        libSmiles = df["smiles"].iloc[j]
                        libInchiKey = df["inchikey"].iloc[j]
                        libEnergy = df["collision_energy"].iloc[j]

                        res["no"].append(n)
                        res["feature_index"].append(i + 1)
                        res["feature_m/z"].append(fMz)
                        res["feature_original_RT"].append(
                            fRt / 60)  # For output, the unit of RT is minute
                        if doAlignment == 1:
                            res["feature_aligned_RT"].append(fAlignedRt / 60)
                        else:
                            res["feature_aligned_RT"].append(fAlignedRt)
                        for c in intensityCols:
                            res[c].append(fIntensity[c])
                        res["id"].append(libId)
                        res["other_id"].append(libOtherId)
                        res["formula"].append(libFormula)
                        res["name"].append(libName)
                        res["ion"].append(libIon)
                        if hasNumericRt == 1:
                            res["RT"].append(libRt / 60)
                        else:
                            res["RT"].append(libRt)
                        res["SMILES"].append(libSmiles)
                        res["InchiKey"].append(libInchiKey)
                        res["collision_energy"].append(libEnergy)
                        if rtShift != "NA":
                            rtShift = abs(rtShift) / 60  # Convert to "minute"
                        res["RT_shift"].append(rtShift)
                        # Haiyan's preference
                        # RT_score and MS2_score: 0 ~ 1 (bad to good)
                        res["RT_score"].append(simRt)
                        res["MS2_score"].append(simMs2)
                        res["combined_score"].append(abs(-np.log10(p)))

        conn.close()
        res = pd.DataFrame.from_dict(res)
        resCols = ["no", "feature_index", "feature_m/z", "feature_original_RT", "feature_aligned_RT"] + intensityCols + \
                  ["id", "other_id", "formula", "name", "ion", "RT", "SMILES", "InchiKey", "collision_energy", "RT_shift",
                   "RT_score", "MS2_score", "combined_score"]
        res = res[resCols]
        res = res.rename(columns={"other_id": colNameOtherId})

        filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
        outputFile = os.path.join(
            filePath, "align_" + params["output_name"] + "." + str(nLibs) +
            ".library_matches")
        res.to_csv(outputFile, sep="\t", index=False)
        allRes = pd.concat([allRes, res], ignore_index=True)
        nLibs += 1

    # RT unit of "full" needs to be converted back to minute for subsequent procedures (i.e. database search)
    full["feature_RT"] = full["feature_RT"] / 60

    return allRes
Example #14
    nmt.eval()
    print('Model perplexity: ',
          perplexity(nmt, sourceTest, targetTest, batchSize))

if len(sys.argv) > 3 and sys.argv[1] == 'translate':
    (sourceWord2ind,
     targetWord2ind) = pickle.load(open(wordsDataFileName, 'rb'))

    sourceTest = utils.readCorpus(sys.argv[2])

    nmt = model.NMTmodel(embedding_size, hidden_size, targetWord2ind,
                         sourceWord2ind, startToken, padToken, unkToken,
                         endToken).to(device)
    nmt.load(modelFileName)

    nmt.eval()
    file = open(sys.argv[3], 'w')
    pb = utils.progressBar()
    pb.start(len(sourceTest))
    for s in sourceTest:
        file.write(' '.join(nmt.translateSentence(s)) + "\n")
        pb.tick()
    pb.stop()

if len(sys.argv) > 3 and sys.argv[1] == 'bleu':
    ref = [[s] for s in utils.readCorpus(sys.argv[2])]
    hyp = utils.readCorpus(sys.argv[3])

    bleu_score = corpus_bleu(ref, hyp)
    print('Corpus BLEU: ', (bleu_score * 100))
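Example #14 drives the bar through a small object with start(total), tick() and stop() methods (utils.progressBar()). A minimal stand-in with that interface could look like this (an assumed sketch, not the original utility):

import sys

class progressBar:
    # start()/tick()/stop() interface: start() sets the expected total,
    # tick() advances and redraws, stop() finishes the line.
    def __init__(self):
        self.total = 1
        self.count = 0

    def start(self, total):
        self.total = max(int(total), 1)
        self.count = 0
        self._draw()

    def tick(self):
        self.count += 1
        self._draw()

    def stop(self):
        sys.stdout.write("\n")
        sys.stdout.flush()

    def _draw(self):
        sys.stdout.write("\r%d / %d" % (self.count, self.total))
        sys.stdout.flush()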
Example #15
def ms2ForFeatures(full, mzxmlFiles, paramFile):
    print("  Identification of MS2 spectra for the features")
    print("  ==============================================")
    logging.info("  Identification of MS2 spectra for the features")
    logging.info("  ==============================================")
    full = full.to_records(
        index=False
    )  # Change pd.DataFrame to np.RecArray for internal computation (speed issue)

    ######################################
    # Load parameters and initialization #
    ######################################
    params = utils.getParams(paramFile)
    # ppiThreshold = "max"  # Hard-coded
    ppiThreshold = params["ppi_threshold_of_features"]
    pctTfThreshold = float(params["max_percentage_RT_range"])
    tolIsolation = float(params["isolation_window"])
    tolPrecursor = float(params["tol_precursor"])
    tolIntraMS2Consolidation = float(params["tol_intra_ms2_consolidation"])
    tolInterMS2Consolidation = float(params["tol_inter_ms2_consolidation"])
    nFeatures = len(full)
    nFiles = len(mzxmlFiles)
    featureToScan = np.empty((nFeatures, nFiles), dtype=object)
    featureToSpec = np.empty((nFeatures, nFiles), dtype=object)

    #################################################
    # Assignment of MS2 spectra to features         #
    # Consolidation of MS2 spectra for each feature #
    #################################################
    m = -1  # Index for input files
    for file in mzxmlFiles:
        m += 1
        reader = mzxml.MzXML(file)
        fileBasename, _ = os.path.splitext(os.path.basename(file))
        colNames = [
            item for item in full.dtype.names
            if item.startswith(fileBasename + "_")
        ]
        subset = full[colNames]
        subset.dtype.names = [s.split("_")[-1] for s in subset.dtype.names]
        ms2Dict = {}
        minScan, maxScan = int(np.nanmin(subset["minMS1"])), int(
            np.nanmax(subset["maxMS1"]))
        progress = utils.progressBar(maxScan - minScan + 1)
        print("  %s is being processed" % os.path.basename(file))
        print("  Looking for MS2 scan(s) responsible for each feature")
        logging.info("  %s is being processed" % os.path.basename(file))
        logging.info("  Looking for MS2 scan(s) responsible for each feature")
        for i in range(minScan, maxScan + 1):
            progress.increment()
            spec = reader[str(i)]
            msLevel = spec["msLevel"]
            if msLevel == 1:
                surveyNum = i
            elif msLevel == 2:
                # Find MS2 scans which satisfy the following conditions

                # From the discussion around June 2020,
                # 1. In ReAdW-derived mzXML files, precursor m/z values are in two tags: "precursorMz" and "filterLine"
                # 2. Through Haiyan's manual inspection, the real precursor m/z value is closer to one in "filterLine" tag
                # 3. So, in this script, precursor m/z of MS2 scan is obtained from "filterLine" tag
                # 4. Note that it may be specific to ReAdW-derived mzXML files since MSConvert-derived mzXML files do not have "filterLine" tag
                # 4.1. In this case, maybe the use of mzML (instead of mzXML) would be a solution (to-do later)

                # precMz = spec["precursorMz"][0]["precursorMz"]  # Precursor m/z from "precursorMz" tag
                p = re.search("([0-9.]+)\\@", spec["filterLine"])
                precMz = float(p.group(1))
                survey = reader[str(surveyNum)]
                fInd = np.where((surveyNum >= subset["minMS1"])
                                & (surveyNum <= subset["maxMS1"])
                                & (subset["mz"] >= (precMz - tolIsolation))
                                & (subset["mz"] <= (precMz + tolIsolation)) &
                                (subset["PercentageTF"] <= pctTfThreshold))[0]
                if len(fInd) > 0:
                    ppi = []
                    for i in range(len(fInd)):
                        mz = subset["mz"][fInd[i]]
                        lL = mz - mz * tolPrecursor / 1e6
                        uL = mz + mz * tolPrecursor / 1e6
                        ind = np.where((survey["m/z array"] >= lL)
                                       & (survey["m/z array"] <= uL))[0]
                        if len(ind) > 0:
                            ppi.append(np.max(survey["intensity array"][ind]))
                        else:
                            ppi.append(0)

                    if sum(ppi) == 0:
                        continue
                    ppi = ppi / np.sum(
                        ppi) * 100  # Convert intensities to percentage values
                    if ppiThreshold == "max":
                        fInd = np.array([fInd[np.argmax(ppi)]])
                    else:
                        # ppiThreshold should be a numeric value
                        ppiThreshold = float(ppiThreshold)
                        fInd = fInd[np.where(ppi > ppiThreshold)]
                    if len(fInd) == 0:  # Last check of candidate feature indexes
                        continue
                    else:
                        # Add this MS2 scan information to ms2Dict
                        ms2Dict[spec["num"]] = {}
                        ms2Dict[spec["num"]]["mz"] = spec["m/z array"]
                        ms2Dict[
                            spec["num"]]["intensity"] = spec["intensity array"]

                        # Mapping between features and MS2 scan numbers
                        for i in range(len(fInd)):
                            if featureToScan[fInd[i], m] is None:
                                featureToScan[fInd[i], m] = spec["num"]
                            else:
                                featureToScan[fInd[i], m] += ";" + spec["num"]

        print(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        logging.info(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            if featureToScan[i, m] is not None:
                spec = intraConsolidation(ms2Dict, featureToScan[i, m],
                                          tolIntraMS2Consolidation)
                featureToSpec[i, m] = spec

    print(
        "  Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    print(
        "  Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    logging.info(
        "  Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    logging.info(
        "  Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    specArray = np.array([])
    progress = utils.progressBar(nFeatures)
    for i in range(nFeatures):
        progress.increment()
        if np.sum(featureToSpec[i] == None) == nFiles:
            specArray = np.append(specArray, None)
        else:
            spec = interConsolidation(featureToSpec[i, :],
                                      tolInterMS2Consolidation)
            specArray = np.append(specArray, spec)

    ###############################
    # MS2 processing for features #
    ###############################
    # "specArray" is the list of (consolidated) MS2 spectra
    # specArray[i] is the MS2 spectrum corresponding to the i-th feature
    # If there's no MS2 spectrum, then specArray[i] is None
    df = utils.summarizeFeatures(full, params)
    # Add the mean m/z of feature and its charge state to the beginning of MS2 spectrum (similar to .dta file)
    for i in range(nFeatures):
        if specArray[i] is not None:
            specArray[i]["mz"] = np.insert(specArray[i]["mz"], 0,
                                           df["feature_m/z"].iloc[i])
            specArray[i]["intensity"] = np.insert(specArray[i]["intensity"], 0,
                                                  df["feature_z"].iloc[i])
    df["MS2"] = specArray
    df = df.sort_values(
        by="feature_m/z",
        ignore_index=True)  # Features are sorted by "feature_m/z"
    df.insert(loc=0, column="feature_num", value=df.index + 1)
    # df["feature_num"] = df.index + 1  # Update "feature_num" according to the ascending order of "feature_m/z" (as sorted)

    # Write MS2 spectra to files
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    ms2Path = os.path.join(filePath, "MS2")
    if not os.path.exists(ms2Path):
        os.mkdir(ms2Path)
    for i in range(df.shape[0]):
        if df["MS2"].iloc[i] is not None:
            fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
            dfMS2 = pd.DataFrame.from_dict(df["MS2"].iloc[i])
            dfMS2.to_csv(fileName, index=False, header=False, sep="\t")

    # Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purpose
    # When the pipeline gets mature, this part needs to be removed
    pickle.dump(df,
                open(os.path.join(filePath, ".fully_aligned_feature.pickle"),
                     "wb"))  # Make the file be hidden

    ##########################
    # Handling mzXML file(s) #
    ##########################
    # Move mzXML files to the directory(ies) where individual .feature files are located
    if params["skip_feature_detection"] == "0":
        for file in mzxmlFiles:
            baseFilename = os.path.basename(file)
            featureDirectory = os.path.join(os.getcwd(),
                                            os.path.splitext(baseFilename)[0])
            os.rename(file, os.path.join(featureDirectory, baseFilename))

    return df, featureToScan
Example #16
def buildG4BLfield(magDict, gridDict, saveAs=None, FBonly=False, coil=True):
    """Builds a magnetic field of SSU/SSD and prints it out to a .table file in g4blgrid format.

    Args:
        magDict  (dict): Dictionary containing the magnet name, coil currents and custom fitDict paths.
                         If fitDict paths are not specified, the default ones are used.
        gridDict (dict): Dictionary containing information about the grid over which to calculate
                         the field.
        saveAs   (str):  Name that the user wishes to give the output field file (no need to supply
                         the full path). If None (default value), the magnet name + today's date is used.
        FBonly (bool):   When True: calculate only FB terms. When False: calculate geofit+FB terms,
                         i.e. the full model field is output.
        coil (bool):     When True, the full field is calculated from the coil fit model. If False,
                         the geometrical fit model is used instead.

    Returns:
        Doesn't return anything.  The output field is saved at data/MAUS/saveAs.table.

    Todo:
        * The scaleList part could change; it may need support so that the user can adjust it.
    """
    print 'Calculating field map for magnet:', magDict['magnet']
    print 'With currents:'
    print '\n\t M1  -> %.2f A\n\t M2  -> %.2f A\n\t ECE -> %.2f A\n'%(magDict['M1']['I'], \
                                                                      magDict['M2']['I'], \
                                                                      magDict['CC']['I'])
    if not FBonly and coil:
        coilfit_calc = get_coilfit_class(magDict)
        
    print 'This could take a while...'
    if saveAs is None:
        _date = time.localtime()
        saveAs = '%s_%s%02d%02d.table'%(magDict['magnet'], _date.tm_year, \
                                    _date.tm_mon, _date.tm_mday)

    xNsteps = int((gridDict['x']['end'] + gridDict['x']['step'])/gridDict['x']['step'])
    xARR = np.linspace(gridDict['x']['start'], gridDict['x']['end'], xNsteps)
    
    yNsteps = int((gridDict['y']['end'] + gridDict['y']['step'])/gridDict['y']['step'])
    yARR = np.linspace(gridDict['y']['start'], gridDict['y']['end'], yNsteps)

    zNsteps = int((gridDict['z']['end'] + gridDict['z']['step'])/gridDict['z']['step'])
    zARR = np.linspace(gridDict['z']['start'], gridDict['z']['end'], zNsteps)

    scaleList = [' 1 X [1e3]\n', ' 2 Y [1e3]\n', ' 3 Z [1e3]\n', \
                 ' 4 BX [1e-3]\n', ' 5 BY [1e-3]\n', ' 6 BZ [1e-3]\n', ' 0\n']
    print 'Writing out %d field points'%(xNsteps*yNsteps*zNsteps)
    count = 1

    start_time = time.time()
    with open(os.path.join(utils.maus_field_path, saveAs), 'w') as _output:
        _output.write('\t%d\t%d\t%d\t1\n'%(xNsteps, yNsteps, zNsteps))
        for i in scaleList:
            _output.write(i)

        for _x in xARR:
            for _y in yARR:
                for _z in zARR:
                    if FBonly == True:
                        Bx, By, Bz = appFB.applyFB_grid(magDict, _x, _y, _z, 0, 0, 0)
                    elif FBonly == False:
                        _Bx, _By, _Bz = coilfit_calc.calc_full_field_at_point_xyz(_x, _y, _z)
                        Bx, By, Bz = appFB.applyFB_grid(magDict, _x, _y, _z, _Bx, _By, _Bz)
                    _output.write('{:.3f}\t{:.3f}\t{:.3f}\t{:.8f}\t{:.8f}\t{:.8f}\n'.format( \
                                    _x, _y,_z, Bx, By, Bz))
                    utils.progressBar(count, xNsteps*yNsteps*zNsteps, start_time, time.time())
                    count += 1
                        
    print 'Finished! File can be found at %s'%os.path.join(utils.maus_field_path, saveAs)
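The dictionary layouts that buildG4BLfield expects can be read off the keys it accesses (magDict['magnet'], magDict['M1']['I'], magDict['M2']['I'], magDict['CC']['I'], and gridDict[axis]['start'/'end'/'step']). A hypothetical call with illustrative values, relying on the default fitDict paths, might look like:

magDict = {'magnet': 'SSU',
           'M1': {'I': 150.0},   # coil currents in amps (illustrative values only)
           'M2': {'I': 140.0},
           'CC': {'I': 200.0}}
gridDict = {'x': {'start': -200.0, 'end': 200.0, 'step': 10.0},
            'y': {'start': -200.0, 'end': 200.0, 'step': 10.0},
            'z': {'start': -2000.0, 'end': 2000.0, 'step': 20.0}}
buildG4BLfield(magDict, gridDict, saveAs='SSU_test.table')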
Example #17
    output_data = []
    print('Fitting functions to clusters...')
    for i, cluster in enumerate(extracted_point_clusters):
        cluster.show_object_fit = show_object_fit
        cluster.show_object_fit_separate = show_object_fit_separate
        cluster.add_header_data(headers)
        cluster.add_background_data(background)
        try:
            params = cluster.fit_curve(function=fit_function,
                                       square_size=square_size)
        except Exception:
            continue  # suppress all Exceptions, incorrect fits are discarded
        finally:
            if not show_object_fit and not show_object_fit_separate:
                progressBar(i, len(extracted_point_clusters) - 1)

        if cluster.correct_fit:
            output_data.append(cluster.output_data())

    result = ""
    result += '-' * 150 + '\n'
    result += '{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}'.format(
        "x", "y", "flux", "fwhm_x|fwhm_y", "peak_SNR", "fit_rms",
        "skew_x|skew_y", "kurt_x|kurt_y") + '\n'
    result += '-' * 150 + '\n'
    for i, data in enumerate(output_data):
        result += '{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}{:<15}'.format(
            data[0], data[1], data[2], data[3], data[4], data[5], data[6],
            data[7]) + '\n'
Beispiel #18
0
def main(args):

    dataset_name = args.dataset
    model_name = args.model
    n_inner_iter = args.adaptation_steps
    batch_size = args.batch_size
    save_model_file = args.save_model_file
    load_model_file = args.load_model_file
    lower_trial = args.lower_trial
    upper_trial = args.upper_trial
    is_test = args.is_test
    stopping_patience = args.stopping_patience
    epochs = args.epochs
    fast_lr = args.learning_rate
    slow_lr = args.meta_learning_rate
    noise_level = args.noise_level
    noise_type = args.noise_type
    resume = args.resume

    first_order = False
    inner_loop_grad_clip = 20
    task_size = 50
    output_dim = 1

    horizon = 10
    ##test

    meta_info = {
        "POLLUTION": [5, 50, 14],
        "HR": [32, 50, 13],
        "BATTERY": [20, 50, 3]
    }

    assert model_name in ("FCN", "LSTM"), "Model was not correctly specified"
    assert dataset_name in ("POLLUTION", "HR", "BATTERY")

    window_size, task_size, input_dim = meta_info[dataset_name]

    grid = [0., noise_level]
    output_directory = "output/"

    train_data_ML = pickle.load(
        open(
            "../../Data/TRAIN-" + dataset_name + "-W" + str(window_size) +
            "-T" + str(task_size) + "-ML.pickle", "rb"))
    validation_data_ML = pickle.load(
        open(
            "../../Data/VAL-" + dataset_name + "-W" + str(window_size) + "-T" +
            str(task_size) + "-ML.pickle", "rb"))
    test_data_ML = pickle.load(
        open(
            "../../Data/TEST-" + dataset_name + "-W" + str(window_size) +
            "-T" + str(task_size) + "-ML.pickle", "rb"))

    for trial in range(lower_trial, upper_trial):

        output_directory = "../../Models/" + dataset_name + "_" + model_name + "_MAML/" + str(
            trial) + "/"
        save_model_file_ = output_directory + save_model_file
        save_model_file_encoder = output_directory + "encoder_" + save_model_file
        load_model_file_ = output_directory + load_model_file

        try:
            os.mkdir(output_directory)
        except OSError as error:
            print(error)

        with open(output_directory + "/results2.txt", "a+") as f:
            f.write("Learning rate :%f \n" % fast_lr)
            f.write("Meta-learning rate: %f \n" % slow_lr)
            f.write("Adaptation steps: %f \n" % n_inner_iter)
            f.write("\n")

        if model_name == "LSTM":
            model = LSTMModel(batch_size=batch_size,
                              seq_len=window_size,
                              input_dim=input_dim,
                              n_layers=2,
                              hidden_dim=120,
                              output_dim=output_dim)
            model2 = LinearModel(120, 1)
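        # Note: only the "LSTM" branch constructs model/model2 here; selecting "FCN"
        # (allowed by the assert above) would leave them undefined at this point.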
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(model2.parameters()),
                                     lr=slow_lr)
        loss_func = mae
        #loss_func = nn.SmoothL1Loss()

        #torch.backends.cudnn.enabled = False

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        meta_learner = MetaLearner(model2, optimizer, fast_lr, loss_func,
                                   first_order, n_inner_iter,
                                   inner_loop_grad_clip, device)
        model.to(device)

        early_stopping = EarlyStopping(patience=stopping_patience,
                                       model_file=save_model_file_encoder,
                                       verbose=True)
        early_stopping2 = EarlyStopping(patience=stopping_patience,
                                        model_file=save_model_file_,
                                        verbose=True)

        if resume:
            model.load_state_dict(torch.load(save_model_file_encoder))
            model2.load_state_dict(
                torch.load(save_model_file_)["model_state_dict"])

            val_error = test(validation_data_ML, meta_learner, model, device)

            early_stopping(val_error, model)
            early_stopping2(val_error, meta_learner)

        total_tasks, task_size, window_size, input_dim = train_data_ML.x.shape
        accum_mean = 0.0

        for epoch in range(epochs):

            model.zero_grad()
            meta_learner._model.zero_grad()

            #train
            #batch_idx = np.random.randint(0, total_tasks-1, batch_size)

            for batch_idx in range(0, total_tasks - 1, batch_size):
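                # Support batch: tasks [batch_idx, batch_idx + batch_size); the query batch
                # is the same window shifted forward by one task index.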

                x_spt, y_spt = train_data_ML[batch_idx:batch_idx + batch_size]
                x_qry, y_qry = train_data_ML[batch_idx + 1:batch_idx + 1 +
                                             batch_size]

                x_spt, y_spt = to_torch(x_spt), to_torch(y_spt)
                x_qry = to_torch(x_qry)
                y_qry = to_torch(y_qry)

                # data augmentation
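                # epsilon is drawn from {0, noise_level} and perturbs the targets of both
                # support and query sets, additively or multiplicatively per noise_type.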
                epsilon = grid[np.random.randint(0, len(grid))]

                if noise_type == "additive":
                    y_spt = y_spt + epsilon
                    y_qry = y_qry + epsilon
                else:
                    y_spt = y_spt * (1 + epsilon)
                    y_qry = y_qry * (1 + epsilon)

                train_tasks = [
                    Task(model.encoder(x_spt[i]), y_spt[i])
                    for i in range(x_spt.shape[0])
                ]
                val_tasks = [
                    Task(model.encoder(x_qry[i]), y_qry[i])
                    for i in range(x_qry.shape[0])
                ]

                adapted_params = meta_learner.adapt(train_tasks)
                mean_loss = meta_learner.step(adapted_params,
                                              val_tasks,
                                              is_training=True)
                accum_mean += mean_loss.cpu().detach().numpy()

                progressBar(batch_idx, total_tasks, 100)

            print(accum_mean / (batch_idx + 1))

            #test

            val_error = test(validation_data_ML, meta_learner, model, device)
            test_error = test(test_data_ML, meta_learner, model, device)
            print("Epoch:", epoch)
            print("Val error:", val_error)
            print("Test error:", test_error)

            early_stopping(val_error, model)
            early_stopping2(val_error, meta_learner)

            if early_stopping.early_stop:
                print("Early stopping")
                break

        print("hallo")
        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(
            torch.load(save_model_file_)["model_state_dict"])
        meta_learner = MetaLearner(model2, optimizer, fast_lr, loss_func,
                                   first_order, n_inner_iter,
                                   inner_loop_grad_clip, device)

        validation_error = test(validation_data_ML, meta_learner, model,
                                device)
        test_error = test(test_data_ML, meta_learner, model, device)

        validation_error_h1 = test(validation_data_ML,
                                   meta_learner,
                                   model,
                                   device,
                                   horizon=1)
        test_error_h1 = test(test_data_ML,
                             meta_learner,
                             model,
                             device,
                             horizon=1)

        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(
            torch.load(save_model_file_)["model_state_dict"])
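        # n_inner_iter = 0: evaluate without any task-specific adaptation, as a no-adaptation baseline.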
        meta_learner2 = MetaLearner(model2, optimizer, fast_lr, loss_func,
                                    first_order, 0, inner_loop_grad_clip,
                                    device)

        validation_error_h0 = test(validation_data_ML,
                                   meta_learner2,
                                   model,
                                   device,
                                   horizon=1)
        test_error_h0 = test(test_data_ML,
                             meta_learner2,
                             model,
                             device,
                             horizon=1)

        model.load_state_dict(torch.load(save_model_file_encoder))
        model2.load_state_dict(
            torch.load(save_model_file_)["model_state_dict"])
        meta_learner2 = MetaLearner(model2, optimizer, fast_lr, loss_func,
                                    first_order, n_inner_iter,
                                    inner_loop_grad_clip, device)
        validation_error_mae = test(validation_data_ML, meta_learner2, model,
                                    device)
        test_error_mae = test(test_data_ML, meta_learner2, model, device)
        print("test_error_mae", test_error_mae)

        with open(output_directory + "/results2.txt", "a+") as f:
            f.write("Test error: %f \n" % test_error)
            f.write("Validation error: %f \n" % validation_error)
            f.write("Test error h1: %f \n" % test_error_h1)
            f.write("Validation error h1: %f \n" % validation_error_h1)
            f.write("Test error h0: %f \n" % test_error_h0)
            f.write("Validation error h0: %f \n" % validation_error_h0)
            f.write("Test error mae: %f \n" % test_error_mae)
            f.write("Validation error mae: %f \n" % validation_error_mae)

        print(test_error)
        print(validation_error)
Beispiel #19
0
def train(self, num_trials):
    for trial in xrange(num_trials):
        mr = self._update_weights()
        progressBar(trial, num_trials, info="Mean returns: {}".format(mr))
    progressBar(num_trials, num_trials)
    print "Done"
psms = pd.read_csv(idTxt, skiprows=1,
                   sep=";")  # Note that ID.txt file is delimited by semicolon
psms = psms[["Peptide", "Outfile", "measuredMH", "XCorr"]]
psms = psms.loc[psms["Outfile"].str.contains(
    mzXMLBaseName)]  # Extract PSMs from FTLD_Batch2_F50.mzXML
psms["charge"] = [
    outfile.split("/")[-1].split(".")[-2] for outfile in psms["Outfile"]
]
psms = psms.drop_duplicates()
print("  PSM information has been parsed\n")

# Unique key is peptide-charge pair
keys = psms["Peptide"] + "_" + psms["charge"]
keys = list(set(keys))
res = []
progress = utils.progressBar(len(keys))
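# For each peptide-charge pair, gather the precursor-peak retention time and intensity of
# every matching PSM, then combine them into a single representative RT below.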
for key in keys:
    progress.increment()
    pep, z = key.split("_")
    rtArray = np.array([])
    intArray = np.array([])
    for _, psm in psms[(psms["Peptide"] == pep)
                       & (psms["charge"] == z)].iterrows():
        [_, psmScanNum, _, _, _] = os.path.basename(psm["Outfile"]).split(".")
        psmScanNum = int(psmScanNum)
        surveyScanNum = ms2ToSurvey[psmScanNum]
        _, precIntensity, precRt = getPrecursorPeak(reader, int(psmScanNum),
                                                    surveyScanNum, params)
        rtArray = np.append(rtArray, precRt)
        intArray = np.append(intArray, precIntensity)
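    # Intensity-weighted mean retention time; the division by 60 presumably converts seconds to minutes.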
    rt = sum(rtArray * intArray) / sum(intArray) / 60