Example #1
0
    def getSlot(self, record):
        """Build a BookingSlot from *record*.

        Returns None (after logging why) when any required slot field is
        missing, or when the mapped registrationCenter value is empty.
        """
        rc_key = self.input_to_idschema['registrationCenter']['final']

        # Required raw keys with the label used in the skip message.
        required = (('slotDate', 'SlotDate'),
                    ('slotTimeFrom', 'slotTimeFrom'),
                    ('slotTimeTo', 'slotTimeTo'))
        for key, label in required:
            if not keyExists(key, record):
                myprint("Skipping appointment booking: " + label + " is empty")
                return None

        if not keyExists(rc_key, record) or len(record[rc_key]) == 0:
            myprint("Skipping appointment booking: registrationCenter is empty")
            return None

        return BookingSlot(record['slotDate'],
                           record['slotTimeFrom'],
                           record['slotTimeTo'],
                           record[rc_key],
                           record['arn'])
Example #2
0
def audit(e, args):
    """Run the main audit loop: execute stages until the time limit is hit,
    the audit decides to stop, or the operator declines to continue.

    e    -- election/audit state object; mutated by every helper called here
    args -- parsed command-line options; args.pause gates the per-stage prompt
    """

    read_audit_spec(e, args)
    initialize_audit(e)
    saved_state.write_initial_saved_state(e)
    show_audit_spec(e)

    utils.myprint("====== Audit ======")

    while True:
        stage_time = utils.datetime_string()
        # Stop once the stage start time passes the configured maximum.
        # NOTE(review): this is a string comparison — assumes
        # utils.datetime_string() yields lexically sortable timestamps; confirm.
        if stage_time > e.max_stage_time:
            break
        audit_stage(e, stage_time)
        if stop_audit(e):
            break
        planner.compute_plan(e)

        print("Slack:", risk_bayes.compute_slack_p(e))
        # Only the first measurement id is tweaked here — presumably
        # intentional; verify against risk_bayes.tweak_all's contract.
        mid = e.mids[0]
        risk_bayes.tweak_all(e, mid)

        # Interactive gate: anything not starting with 'y' ends the audit.
        if args.pause and not input(
                "Begin new audit stage? (y or n):").startswith('y'):
            break
        saved_state.write_intermediate_saved_state(e)
        time.sleep(2)  # to ensure next stage_time is new
    show_audit_summary(e)
Example #3
0
    def load_gene_data(self):
        """Load the gene-expression matrix and the clinical table, join them,
        apply cs.dataset_filter, and split the result into train/test sets.

        Side effects: sets self.gene_ids, self.patient_ids, self.data,
        self.cdf, self.train_set and self.test_set.
        """
        gene_file_path = os.path.join(self.data_dir, cs.gene_file)

        df = pd.read_csv(gene_file_path, header=0, index_col=0)

        # Truncate the gene id list to the configured cutoff (rows = genes,
        # columns = patients). If 'gene_cutoff' is absent, config.get gives
        # None and the slice keeps everything.
        # NOTE(review): self.data keeps ALL rows even when gene_ids is
        # truncated — confirm that mismatch is intended.
        cutoff = config.get('gene_cutoff')
        self.gene_ids = list(df.axes[0].values[0:cutoff])
        self.patient_ids = list(df.axes[1].values)
        self.data = df.values

        print('Loaded data for {} genes. Examples: {}....'.format(
            len(self.gene_ids), self.gene_ids[0:4]))
        print('Loaded data for {} patients. Examples: {}....'.format(
            len(self.patient_ids), self.patient_ids[0:4]))

        clinical_file_path = os.path.join(self.data_dir, cs.clinical_file)
        self.cdf = pd.read_csv(clinical_file_path, header=0, index_col=0)

        cdf = self.cdf
        # Fixed typo in the log message (was "clinial").
        print('Loaded clinical data with {} features. Examples: {}....'.format(
            cdf.shape[1], cdf.axes[1].values[0:4]))

        # Join clinical rows (patients) with their gene columns (df transposed).
        agg = cdf.join(df.T)

        for col, value in cs.dataset_filter.items():
            # Fixed stray quote in the format string (used to render as
            # column "x"" with value ...).
            myprint('Filtering dataset on column "{}" with value {}'.format(
                col, value))
            agg = agg.loc[lambda df: df.get(col) == value, :]

        # Stratify on the cohort column so both splits keep its distribution.
        self.train_set, self.test_set = train_test_split(
            agg, test_size=0.2, random_state=77,
            stratify=agg.get('cohort').values)
Example #4
0
 def bookAppointment(self, slot_info: BookingSlot):
     """Book an appointment for *slot_info* through the API helper and log
     whatever the API returns."""
     payload = {
         "preRegistrationId": slot_info.prid,
         "registration_center_id": slot_info.registration_center,
         "appointment_date": slot_info.slot_date,
         "time_slot_from": slot_info.slot_time_from,
         "time_slot_to": slot_info.slot_time_to,
     }
     response = self.api_helper.bookAppointment(payload)
     myprint(response)
     return
def moveData(data):
    """Copy one data tarball into TMPDIR, unpack it into its own chunk
    directory, drop unneeded subdirectories, and remove the tarball copy."""
    chunk_id = utils.getName(data).zfill(4)
    utils.myprint('Moving chunck ' + chunk_id)
    tarball = data.split('/')[-1]
    chunk_dir = TMPDIR + chunk_id
    utils.mymkdir(chunk_dir)
    utils.mycmd('cp ' + data + ' ' + TMPDIR)
    utils.mycmd('tar -C ' + chunk_dir + ' -xf ' + TMPDIR + tarball)
    # Drop the per-chunk annotation subdirectories we do not use.
    for sub in ('/landmarks/', '/confidence/'):
        utils.mycmd('rm -r ' + TMPDIR + '/' + chunk_id + sub)
    if os.path.exists(TMPDIR + '/' + tarball):
        utils.mycmd('rm ' + TMPDIR + '/' + tarball)
Example #6
0
def pretrain_language_model(model, dataloader):
    """MLE-pretrain *model* on src_text batches, logging the running average
    loss every 100 batches, then save the weights and restore cuda mode."""
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=config["generator lr"])
    running_loss = 0
    for step, batch in enumerate(dataloader):
        loss = model(batch["src_text"], labels=batch["src_text"])[0]
        running_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        if (step + 1) % 100 == 0:
            myprint("batch: %d, average loss: %.6f, loss: %.6f"
                    % (step + 1, running_loss / (step + 1), loss.item()))
    model.eval()
    # Move to CPU so the saved state_dict holds CPU tensors, then back.
    model.cpu()
    torch.save(model.state_dict(), "./%s/result/language_model.pkl" % DATASET)
    model.cuda()
Example #7
0
 def createBookingSlot(self, slot_info: BookingSlot):
     """Ensure a slot row exists for this booking: create it when absent,
     reuse it when capacity remains, otherwise bump its appointment count."""
     myprint("Checking Slot for ARN: " + slot_info.prid)
     existing = self.db_session.checkSlot(slot_info)
     if existing is None:
         myprint("Creating slot for ARN: " + slot_info.prid)
         self.db_session.createSlot(slot_info)
     elif existing['available_kiosks'] > existing['appointments']:
         myprint("Slot found for ARN: " + slot_info.prid)
     else:
         myprint("Updating slot for ARN, : " + slot_info.prid)
         self.db_session.updateSlot(slot_info, existing['appointments'] + 1)
     return
Example #8
0
def show_sample_counts(e):
    """Print, per contest and paper-ballot collection, the sample tally broken
    down by reported vote and actual vote, plus a per-reported-vote total."""

    # Fixed missing space between the two concatenated string literals
    # (used to print "...selection]and actual selection:").
    utils.myprint(
        "    Total sample counts by Contest.PaperBallotCollection[reported selection]"
        " and actual selection:")
    for cid in e.cids:
        for pbcid in sorted(e.possible_pbcid_c[cid]):
            tally2 = e.sn_tcpra[e.stage_time][cid][pbcid]
            for r in sorted(tally2.keys()):  # r = reported vote
                utils.myprint("      {}.{}[{}]".format(cid, pbcid, r), end='')
                for a in sorted(tally2[r].keys()):  # a = actual vote
                    utils.myprint("  {}:{}".format(a, tally2[r][a]), end='')
                utils.myprint("  total:{}".format(
                    e.sn_tcpr[e.stage_time][cid][pbcid][r]))
Example #9
0
def show_audit_stage_header(e):
    """Print the current stage time and, per paper ballot collection, the new
    planned sample size alongside its increment over the last saved size."""

    utils.myprint("audit stage time", e.stage_time)
    utils.myprint("    New target sample sizes by paper ballot collection:")
    saved = e.saved_state
    stage = saved["stage_time"]
    previous = saved["sn_tp"][stage]
    for pbcid in e.pbcids:
        planned = saved["plan_tp"][stage][pbcid]
        utils.myprint("      {}: {} (+{})".format(
            pbcid, planned, planned - previous[pbcid]))
Example #10
0
def show_risks_and_statuses(e):
    """
    Show election and contest statuses for current stage.
    """

    # Fixed missing space between the concatenated literals
    # (used to print "...wrong)and measurement...").
    utils.myprint(("    Risk (that reported outcome is wrong)"
                   " and measurement status per mid:"))
    for mid in e.mids:
        utils.myprint(
            "     ", mid, e.cid_m[mid], e.risk_method_m[mid],
            e.sampling_mode_m[mid],
            "Risk={}".format(e.risk_tm[e.stage_time][mid]),
            "(limits {},{})".format(e.risk_limit_m[mid], e.risk_upset_m[mid]),
            e.status_tm[e.stage_time][mid])
    utils.myprint("    Election status:", e.election_status_t[e.stage_time])
def show_election_spec(e):
    """Print a human-readable dump of the election spec held on *e*:
    identity, contests, selections, and paper ballot collections."""
    utils.myprint("====== Election spec ======")
    # Fixed label (was "Election name: (e.election_name):" — stray colon).
    utils.myprint("Election name (e.election_name):")
    utils.myprint("    {}".format(e.election_name))
    utils.myprint("Election dirname (e.election_dirname):")
    utils.myprint("    {}".format(e.election_dirname))
    # Fixed label (was "(e.election date)" — missing underscore).
    utils.myprint("Election date (e.election_date):")
    utils.myprint("    {}".format(e.election_date))
    utils.myprint("Election URL (e.election_url):")
    utils.myprint("    {}".format(e.election_url))
    utils.myprint("Number of contests:")
    utils.myprint("    {}".format(len(e.cids)))
    utils.myprint(
        "Contest ids with contest type, additional parameters, and write-ins mode"
    )
    utils.myprint("(e.cids, e.contest_type_c, e.params_c, e.write_ins_c):")
    for cid in e.cids:
        utils.myprint("    {} ({}, {} winner(s), {} write-ins)".format(
            cid, e.contest_type_c[cid], e.params_c[cid], e.write_ins_c[cid]))
    utils.myprint("Valid selection ids for each cid (e.selids_c):")
    for cid in e.cids:
        utils.myprint("    {}: ".format(cid), end='')
        utils.myprint(", ".join(sorted(e.selids_c[cid])))
    utils.myprint("Number of paper ballot collections:")
    utils.myprint("    {}".format(len(e.pbcids)))
    utils.myprint(
        "Paper ballot collection ids (e.pbcids), CVR types (e.cvr_type_p), and managers (e.manager_p):"
    )
    for pbcid in sorted(e.pbcids):
        utils.myprint("    {} ({}, Manager:{})".format(pbcid,
                                                       e.cvr_type_p[pbcid],
                                                       e.manager_p[pbcid]))
    utils.myprint("Required pbcids for each cid (e.required_pbcid_c):")
    for cid in e.cids:
        utils.myprint("    {}: ".format(cid), end='')
        utils.myprint(", ".join(sorted(e.required_pbcid_c[cid])))
    utils.myprint("Possible pbcids for each cid (e.possible_pbcid_c):")
    for cid in e.cids:
        utils.myprint("    {}: ".format(cid), end='')
        utils.myprint(", ".join(sorted(e.possible_pbcid_c[cid])))
# Source directory holding the input data tarballs, and the list of tarballs.
dataTars_string = '/nfs/isicvlnas01/projects/glaive/expts/00036-anhttran_prepare_poseCNN_COW2/expts/COW_data'
dataTars = glob.glob(dataTars_string + '/*.tar')

# Directory of precomputed data from an earlier experiment (read-only input).
precompute_data = '/nfs/isicvlnas01/projects/glaive/expts/00036-anhttran_clean_MsCeleb_poseCNN/expts/final_data/'
# Per-job scratch directory; trailing '/' so paths can be built by plain '+'.
TMPDIR = os.environ['TMPDIR'] + '/'

# Training log filename (leading '/' — joined onto a directory elsewhere).
log_file = '/12_cow_sum_nomean_dim2048_log.training'


def myprint(sometext, printON=False):
    """Print *sometext* and flush stdout, but only when printON is True.

    Defaults to silent so call sites can leave debug prints in place cheaply.
    """
    if printON:
        # print() as a function is valid on both Python 2 and 3; the original
        # `print sometext` statement is a SyntaxError under Python 3.
        print(sometext)
        sys.stdout.flush()


# Import-time log of the scratch directory this run will work in.
utils.myprint('Working in ' + TMPDIR)


def moveData(data):
    """Stage one data tarball: copy it into TMPDIR, extract it into a chunk
    subdirectory, prune unused annotation dirs, and delete the tarball copy."""
    chunk_id = utils.getName(data).zfill(4)
    utils.myprint('Moving chunck ' + chunk_id)
    archive_name = data.split('/')[-1]
    utils.mymkdir(TMPDIR + chunk_id)
    utils.mycmd('cp ' + data + ' ' + TMPDIR)
    utils.mycmd('tar -C ' + TMPDIR + chunk_id + ' -xf ' + TMPDIR + archive_name)
    # Remove subdirectories this pipeline does not consume.
    utils.mycmd('rm -r ' + TMPDIR + '/' + chunk_id + '/landmarks/')
    utils.mycmd('rm -r ' + TMPDIR + '/' + chunk_id + '/confidence/')
    staged_archive = TMPDIR + '/' + archive_name
    if os.path.exists(staged_archive):
        utils.mycmd('rm ' + staged_archive)
Example #13
0
    def createApplication(self, record):
        """Build the ID-schema payload for *record*, submit it, and verify the
        returned PRID matches the record's ARN.

        Returns the API response dict. Raises RuntimeError when the inserted
        PRID does not equal record['arn'].
        """
        schema = self.input_to_idschema
        data = {}
        data['IDSchemaVersion'] = '0.1'

        def put_simple(name):
            # Copy a mandatory field, wrapped by getSimpleType.
            key = schema[name]['final']
            data[key] = self.getSimpleType(record[key])

        def put_optional(name, simple=True):
            # Copy an optional field only when present and truthy; wrap with
            # getSimpleType unless the raw value is wanted.
            key = schema[name]['final']
            if keyExists(key, record) and record[key]:
                data[key] = (self.getSimpleType(record[key])
                             if simple else record[key])

        # name
        for name in ('firstName', 'lastName', 'middleName', 'suffix'):
            put_simple(name)

        # personal (dateOfBirth/phone/email are stored raw, as before)
        put_optional('gender')
        put_optional('dateOfBirth', simple=False)
        put_optional('bloodType')
        put_optional('residenceStatus')
        put_optional('maritalStatus')
        put_optional('phone', simple=False)
        put_optional('email', simple=False)

        # birth address
        for name in ('pobCountry', 'pobProvince', 'pobCity'):
            put_simple(name)

        # permanent address
        for name in ('permanentCountry', 'permanentProvince', 'permanentCity',
                     'permanentBarangay', 'permanentAddressLine'):
            put_simple(name)

        # present address
        for name in ('presentCountry', 'presentProvince', 'presentCity',
                     'presentBarangay', 'presentAddressLine'):
            put_simple(name)

        res = self.api_helper.addApplication(data)
        prid = res['preRegistrationId']
        myprint("Checking PRID (" + prid + ") matches with ARN (" +
                record['arn'] + ")")
        if prid != record['arn']:
            # Fixed error message: the expected value's closing "]" was missing.
            raise RuntimeError("Inserted records PRID: expected [" +
                               record['arn'] + "], actual [" + prid + "]")
        return res
Example #14
0
    def execute(self):
        """Drive the migration: for every transformed record, create the
        pre-registration application if missing, then create/book its
        appointment slot when one is specified and none exists yet.

        Raises ValueError when the transformed-records file yields no data.
        NOTE(review): the numeric second argument to myprint (2/11/12) looks
        like a verbosity/level flag — confirm against myprint's definition.
        """
        myprint("Deleting all existing PRIDs from mosip_kernel.prid", 2)
        self.db_session.deleteUnAssignedPridFromKernel()
        records = get_json_file(abs_transformedRecords_path)
        if records is not None:
            for record in records:
                myprint("Processing record: ARN " + record['arn'], 2)
                application_info = self.db_session.getApplication(
                    record['arn'])
                if application_info is None:
                    # New record: reserve the PRID, then create the application.
                    myprint("Inserting PRID to kernel: ARN " + record['arn'])
                    self.db_session.insertPridToKernel(record['arn'])
                    myprint("Creating application: ARN " + record['arn'])
                    application = self.createApplication(record)
                    myprint(
                        "Successfully created application: ARN " +
                        record['arn'], 12)
                    app_status = application['statusCode']
                else:
                    # Existing record: only its current status is needed.
                    app_status = application_info['status_code']
                    myprint(
                        "Skipping application creation as record already exists: ARN "
                        + record['arn'], 11)

                slot_info: BookingSlot = self.getSlot(record)
                if slot_info is not None:
                    appointment_count = self.db_session.appointmentCountByPrid(
                        record['arn'])
                    if appointment_count == 0:
                        self.createBookingSlot(slot_info)
                        # A 'Booked' status must be reset before re-booking.
                        if app_status == 'Booked':
                            self.db_session.updateApplicationStatus(
                                record['arn'], 'Pending_Appointment')
                        self.bookAppointment(slot_info)
                    else:
                        myprint(
                            "Skipping appointment creation as appointment already exists: ARN "
                            + record['arn'], 11)
                else:
                    myprint("No appointment found: ARN " + record['arn'], 11)

                # self.db_session.updatePreregCreatedBy(record['arn'], record[self.input_to_idschema['phone']['final']])
        else:
            raise ValueError("Records not found in transformed data")
Example #15
0
def show_audit_spec(e):
    """Print the audit spec held on *e*: seed, risk measurements, per-collection
    audit rates, stage time limit, trial count, and Dirichlet hyperparameters."""

    utils.myprint("====== Audit spec ======")

    utils.myprint(
        "Seed for audit pseudorandom number generation (e.audit_seed):")
    utils.myprint("    {}".format(e.audit_seed))

    # Fixed missing space between the concatenated literals
    # (used to print "...with contest,method, ...").
    utils.myprint(
        ("Risk Measurement ids (e.mids) with contest,"
         " method, risk limit, and upset threshold, and sampling mode:"))
    for mid in e.mids:
        utils.myprint("    {}: {}, {}, {}, {}, {}".format(
            mid, e.cid_m[mid], e.risk_method_m[mid], e.risk_limit_m[mid],
            e.risk_upset_m[mid], e.sampling_mode_m[mid]))

    utils.myprint("Max number of ballots audited/day (e.max_audit_rate_p):")
    for pbcid in sorted(e.pbcids):
        utils.myprint("    {}:{}".format(pbcid, e.max_audit_rate_p[pbcid]))

    utils.myprint("Max allowed start time for any stage (e.max_stage_time):")
    utils.myprint("    {}".format(e.max_stage_time))

    utils.myprint("Number of trials used to estimate risk"
                  " in compute_contest_risk (e.n_trials):")
    utils.myprint("    {}".format(e.n_trials))

    utils.myprint(
        "Dirichlet hyperparameter for base case or non-matching reported/actual votes"
    )
    utils.myprint("(e.pseudocount_base):")
    utils.myprint("    {}".format(e.pseudocount_base))
    utils.myprint(
        "Dirichlet hyperparameter for matching reported/actual votes")
    utils.myprint("(e.pseudocount_match):")
    utils.myprint("    {}".format(e.pseudocount_match))
Example #16
0
def main(args):
    # start experiment
    report_step = 100
    manualSeed = ID if args.seed == 0 else args.seed
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(manualSeed)
    torch.manual_seed(manualSeed)
    np.random.seed(manualSeed)
    if args.cuda:
        torch.cuda.manual_seed_all(manualSeed)
    string = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    start_log("./log/%d-"%ID + string + LOG, args.log)
    if args.resume != -1:
        resume(args.resume)
    myprint(args)
    for d in config:
        myprint("%s: %s" % (d, str(config[d])))
    args.batch_size = config["BATCH_SIZE"]

    # load data
    tokenizer = GPT2Tokenizer.from_pretrained("./%s/gpt"%DATASET)
    tokenizer.bos_token = '<BOS>'
    tokenizer.pad_token = "<PAD>"
    print(tokenizer.add_tokens(['<negative>']))
    print(tokenizer.add_tokens(['<positive>']))
    print(tokenizer.add_tokens(['<PAD>']))
    print(tokenizer.add_tokens(['<BOS>']))

    with open("./%s/%s-gpt.train.json"%(DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    dataloader = Dataloader.GPTLoader(data, tokenizer, args.batch_size, args.cuda, shuffle=True, input_maxlen=30)

    with open("./%s/%s-gpt.dev.json"%(DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    dev_data = Dataloader.GPTLoader(data, tokenizer, args.batch_size, args.cuda, shuffle=False)

    with open("./%s/%s-gpt.test.json"%(DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    if DATASET == "imdb":
        test_data = Dataloader.GPTLoader(data, tokenizer, args.batch_size, args.cuda)
    else:
        test_data = Dataloader.GPTRefLoader(data, tokenizer, args.batch_size, args.cuda)

    # build model
    generator = GPT2LMHeadModel.from_pretrained("./%s/gpt"%DATASET)
    generator.resize_token_embeddings(len(tokenizer))
    language_model = GPT2LMHeadModel.from_pretrained("./%s/gpt"%DATASET)
    language_model.resize_token_embeddings(len(tokenizer))
    language_model.load_state_dict(torch.load("./%s/result/language_model.pkl"%DATASET))
    language_model.eval()
    if config["g_dir"] is not None:
        generator.load_state_dict(torch.load(config["g_dir"]))
    discriminator_a = classifier.AdvDisNet(word_num=len(tokenizer))
    if config["a_dir"] is not None:
        discriminator_a.load_state_dict(torch.load(config["a_dir"]))

    discriminator_b = classifier.RNNDisNet(word_num=len(tokenizer), num_layers=1, dropout=0)
    sim_model = torch.load('sim/sim.pt', map_location='cpu')
    state_dict = sim_model['state_dict']
    vocab_words = sim_model['vocab_words']
    sim_args = sim_model['args']
    sim_args.gpu = args.gpuid
    sim_model = WordAveraging(sim_args, vocab_words)
    sim_model.load_state_dict(state_dict, strict=True)
    L = nn.CrossEntropyLoss()
    BL = nn.BCELoss()

    if args.cuda:
        generator = generator.cuda()
        discriminator_a = discriminator_a.cuda()
        discriminator_b = discriminator_b.cuda()
        sim_model = sim_model.cuda()
        L = L.cuda()
        BL = BL.cuda()
        language_model = language_model.cuda()
        if args.critic:
            critic = critic.cuda()

    goptimizer = optim.Adam(generator.parameters(), lr=config["generator lr"])
    if config["goptim_dir"] is not None:
        goptimizer.load_state_dict(torch.load(config["goptim_dir"], map_location=torch.device('cuda', args.gpuid)))
    for param_group in goptimizer.param_groups:
        param_group['lr'] = config["generator lr"]
    doptimizer_a = optim.Adam(discriminator_a.parameters(), lr=config["class lr"])
    doptimizer_b = optim.Adam(discriminator_b.parameters(), lr=config["discriminator lr"])
    if config["aoptim_dir"] is not None:
        doptimizer_a.load_state_dict(torch.load(config["aoptim_dir"], map_location=torch.device('cuda', args.gpuid)))
    for param_group in doptimizer_a.param_groups:
        param_group['lr'] = config["class lr"]

    
    EPOCH = config["EPOCH"]
    GBATCH = config["generator batch"]
    DBATCH = config["discriminator batch"]
    W_M = config["mle weight"]
    W_A = config["adv weight"]
    W_S = config["sim weight"]
    W_C = config["cycle weight"]
    W_L = config["language weight"]
    W_D = config["class weight"]
    GRAD_CLIP = config["grad clip"]
    PRETRAIN_BATCH = 0
    accumulation_step = config["accumulation_step"]

    gloss_all, gloss_mle, gloss_adv, gloss_cycle, gloss_sim, dloss_a, dloss_b, gcnt, dcnt, avg_language_loss, avg_language_score, avg_adv_score, avg_language_diff = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    avg_fake_loss, avg_real_loss, avg_sim_score, avg_critic_loss = 0, 0, 0, 0
    avg_cls_loss, avg_cls_score, gloss_class, avg_real_loss_cls, avg_fake_loss_cls = 0, 0, 0, 0, 0
    best_record = 1000
    if args.log:
        os.mkdir("./cache/%d"%(ID))
        os.mkdir("./cache/%d/best/"%(ID))
    best_gname = "./cache/%d/best/gen.dict" % ID
    best_a_dname = "./cache/%d/best/a_dis.dict" % ID
    best_b_dname = "./cache/%d/best/b_dis.dict" % ID
    best_goname = "./cache/%d/best/genopt.dict" % ID
    best_a_doname = "./cache/%d/best/a_disopt.dict" % ID
    best_b_doname = "./cache/%d/best/b_disopt.dict" % ID

    gscheduler = optim.lr_scheduler.StepLR(goptimizer, step_size=500, gamma=0.5)
    dscheduler = optim.lr_scheduler.StepLR(doptimizer_a, step_size=250, gamma=0.5)
    fine_tune_stage = args.reinforce
    language_loss_fct = nn.CrossEntropyLoss(reduce=False)
    prev_language_score = 0
    print(classifier.classifer_test(discriminator_a, tokenizer, dev_data, args.batch_size))
    one_tensor = torch.ones(1)
    if args.cuda:
        one_tensor = one_tensor.cuda() 
    # pretrain_language_model(language_model, dataloader)
    for i in range(EPOCH):
        # generator training
        generator.train()
        discriminator_a.eval()
        step_cnt = 0
        goptimizer.zero_grad()
        for j in range(GBATCH * accumulation_step):
            # print(gcnt)
            step_cnt += 1
            batch = dataloader.get()
            # reconstruction loss
            rec_text = torch.cat((batch["src_text"], batch["style_tokens"].unsqueeze(1), batch["src_text"]), dim=1)
            outputs = generator(rec_text, labels=rec_text)
            mleloss = outputs[0]
            mleloss_ = F.threshold(mleloss, config["mle_threshold"], 0)
            # classifier loss
            transfer_text = torch.cat((batch["src_text"], batch["transfer_tokens"].unsqueeze(1)), dim=1)
            cur_len = transfer_text.size(1)
            _, probs = generate(generator, transfer_text, cur_len=cur_len, max_length=int(cur_len * 2 - 1), pad_token_id=tokenizer.pad_token_id,
             eos_token_ids=tokenizer.eos_token_id, batch_size=args.batch_size)
            probs = F.softmax(probs, dim=2)
            idx_probs, words = torch.max(probs, dim=2)
            style_pred = discriminator_a.approximate(probs, 1 - batch["style"])
            style_pred = torch.squeeze(style_pred, 1)
            class_loss = - torch.log(style_pred + 0.0001).mean()
            # adv loss
            adv_pred = discriminator_b.approximate(probs)
            adv_pred = torch.squeeze(adv_pred, 1)
            advloss = - torch.log(adv_pred + 0.0001).mean()
            # sim loss
            if args.sim:
                wx1, wl1, wm1 = sim_model.torchify_batch([make_example(x, sim_model) for x in batch["tokens"]])
                words_ = words.cpu().data.numpy().tolist()
                generate_sents = [tokenizer.decode(evaluate.clean(sent, tokenizer), skip_special_tokens=True, clean_up_tokenization_spaces=False).replace("' ", "'").lstrip() for sent in words_]
                wx2, wl2, wm2 = sim_model.torchify_batch([make_example(x, sim_model) for x in generate_sents])
                with torch.no_grad():
                    sim_scores = sim_model.scoring_function(wx1, wm1, wl1, wx2, wm2, wl2)
                avg_sim_score += sim_scores.mean().item()
                if args.length_penalty:
                    length_penalty = compute_length_penalty(wl1, wl2, 0.25)
                else:
                    length_penalty = 1
                simloss = torch.mul(- torch.mul(sim_scores, length_penalty), torch.log(idx_probs).mean(dim=1)).mean()
            else:
                simloss = torch.zeros(1).cuda()

            # language fluency loss
            with torch.no_grad():
                outputs = language_model(words)
                true_outputs = language_model(batch["src_text"])
            lm_logits = outputs[0]
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = words[..., 1:].contiguous()
            language_loss = language_loss_fct(shift_logits.transpose(1, 2), shift_labels)
            lengths = torch.LongTensor([evaluate.get_len(x, tokenizer) for x in words_]) - 1
            lengths = lengths.cuda() if args.cuda else lengths
            mask = get_mask(lengths, language_loss.size(1))
            if config["sentence_level"]:
                language_loss = torch.mul(mask, language_loss).sum(1) / (lengths.float() + 0.001)
                true_lm_logits = true_outputs[0]
                true_shift_logits = true_lm_logits[..., :-1, :].contiguous()
                true_shift_labels = batch["src_text"][..., 1:].contiguous()
                true_language_loss = language_loss_fct(true_shift_logits.transpose(1, 2), true_shift_labels)
                true_lengths = batch["length"] - 1
                true_mask = get_mask(true_lengths, true_language_loss.size(1))
                true_language_loss = torch.mul(true_mask, true_language_loss).sum(1) / (true_lengths.float() + 0.001)
                avg_language_diff += (language_loss.mean() - true_language_loss.mean()).item()
            now_language_score = language_loss.mean().item()
            if config["sentence_level"]:
                language_loss = torch.mul(language_loss - true_language_loss, torch.mul(mask, torch.log(idx_probs[:, 1:])).sum(1) / (lengths.float() + 0.001)).mean()
            else:
                language_loss = (torch.mul(torch.mul(language_loss, torch.log(idx_probs[:, 1:])), mask).sum(1) / (lengths.float() + 0.001)).mean()
            avg_language_loss += language_loss.item()
            avg_language_score += now_language_score

            # compute loss
            if gcnt < PRETRAIN_BATCH:
                loss = W_M * mleloss_
            else:
                loss = W_M * mleloss_ + W_A * advloss + W_S * simloss + W_L * language_loss + W_D * class_loss
            gloss_all += loss.item() / accumulation_step
            gloss_mle += mleloss.item()
            gloss_adv += advloss.item()
            gloss_sim += simloss.item()
            gloss_class += class_loss.item()
            now_advloss = advloss.item()
            now_simloss = simloss.item()
            now_loss = loss.item()
            now_mleloss = mleloss.item()
            loss = loss / accumulation_step # normalizing
            loss.backward()
            if step_cnt % accumulation_step == 0:
                gcnt += 1
                step_cnt = 0
                nn.utils.clip_grad_norm_(generator.parameters(), GRAD_CLIP)
                goptimizer.step()
                goptimizer.zero_grad()
                if W_L < config["max_language_weight"]:
                    # adjusting weights
                    W_L += 1
                del advloss, mleloss, mleloss_, loss, simloss
                torch.cuda.empty_cache()
        # discriminator training
        discriminator_b.train()
        discriminator_a.train()
        generator.eval()
        doptimizer_a.zero_grad()
        doptimizer_b.zero_grad()
        for j in range(DBATCH):
            if gcnt < PRETRAIN_BATCH:
                break
            batch = dataloader.get()
            transfer_text = torch.cat((batch["src_text"], batch["transfer_tokens"].unsqueeze(1)), dim=1)
            cur_len = transfer_text.size(1)
            with torch.no_grad():
            	_, probs = generate(generator, transfer_text, cur_len=cur_len, max_length=int(cur_len * 2 - 1), pad_token_id=tokenizer.pad_token_id,
            	 eos_token_ids=tokenizer.eos_token_id, batch_size=args.batch_size)
            	probs = F.softmax(probs, dim=2)
            	probs.detach_()
            # discriminator for naturalness
            if args.reinforce:
                probs, words = torch.max(probs, dim=2)
                style_pred = discriminator_b(words)
            else:
                style_pred = discriminator_b.approximate(probs)
            style_pred = torch.squeeze(style_pred, 1)
            real_style_pred_true = discriminator_b(batch["src_text"])
            real_style_pred_ture = torch.squeeze(real_style_pred_true, 1)
            fake_loss_b = - torch.log(1 - style_pred).mean()
            real_loss_b = - torch.log(real_style_pred_true).mean()
            advloss_b = real_loss_b + fake_loss_b
            avg_fake_loss += fake_loss_b.item()
            avg_real_loss += real_loss_b.item()
            now_fake_loss = fake_loss_b.item()
            now_real_loss = real_loss_b.item()
            now_dis_loss = advloss_b.item()
            dloss_b += advloss_b.item()
            doptimizer_b.zero_grad()
            advloss_b.backward()
            nn.utils.clip_grad_norm_(discriminator_b.parameters(), GRAD_CLIP)
            doptimizer_b.step()
            # discriminator for style
            if args.update_style:
                if args.reinforce:
                    style_pred = discriminator_a(words, 1 - batch["style"])
                else:
                    style_pred = discriminator_a.approximate(probs, 1 - batch["style"])
                style_pred = torch.squeeze(style_pred, 1)
                real_style_pred_true = discriminator_a(batch["src_text"], batch["style"])
                real_style_pred_ture = torch.squeeze(real_style_pred_true, 1)
                fake_loss_a = - torch.log(1 - style_pred).mean()
                real_loss_a = - torch.log(real_style_pred_true).mean()
                advloss_a = real_loss_a + fake_loss_a
                avg_fake_loss_cls += fake_loss_a.item()
                avg_real_loss_cls += real_loss_a.item()
                dloss_a += advloss_a.item()
                doptimizer_a.zero_grad()
                advloss_a.backward()
                nn.utils.clip_grad_norm_(discriminator_a.parameters(), GRAD_CLIP)
                doptimizer_a.step()
            else:
                real_loss_a = 0
                fake_loss_a = 0
                advloss_a = 0
            dcnt += 1
            del real_loss_b, fake_loss_b, advloss_b, real_loss_a, fake_loss_a, advloss_a
            torch.cuda.empty_cache()

        if gcnt % report_step == 0:
            myprint("task id: %d"%ID)
            myprint("generator training batch: %d"%gcnt)
            myprint("average loss: %.6f"%(gloss_all / report_step))
            myprint("average adv loss: %.6f"%(gloss_adv / (report_step * accumulation_step)))
            myprint("average mle loss: %.6f"%(gloss_mle / (report_step * accumulation_step)))
            myprint("average cycle loss: %.6f"%(gloss_cycle / (report_step * accumulation_step)))
            myprint("average sim loss: %.6f"%(gloss_sim / (report_step * accumulation_step)))
            myprint("average sim score: %.6f"%(avg_sim_score / (report_step * accumulation_step)))
            myprint("avg class loss: %.6f"%(gloss_class / (report_step * accumulation_step)))
            myprint("avg class score: %.6f"%(avg_cls_score / (report_step * accumulation_step)))
            myprint("avg language score: %.6f"%(avg_language_score  / (report_step * accumulation_step)))
            myprint("avg language loss: %.6f"%(avg_language_loss  / (report_step * accumulation_step)))
            if config["sentence_level"]:
                myprint("avg language diff: %.6f"%(avg_language_diff  / (report_step * accumulation_step)))
            myprint("avg adv score: %.6f"%(avg_adv_score / (report_step * accumulation_step)))
            avg_language_loss, avg_language_score, avg_adv_score, avg_language_diff = 0, 0, 0, 0
            myprint()
            gloss_all, gloss_mle, gloss_adv, gloss_cycle, gloss_sim, avg_sim_score, gloss_class, avg_cls_score = 0, 0, 0, 0, 0, 0, 0, 0

        if dcnt % report_step == 0 and dcnt != 0:
            myprint("discriminator training batch: %d"%dcnt)
            myprint("b average loss: %.6f"%(dloss_b / (report_step)))
            myprint("avg real loss: %.6f"%(avg_real_loss/(report_step)))
            myprint("avg fake loss: %.6f"%(avg_fake_loss/(report_step)))
            myprint("a average loss: %.6f"%(dloss_a / (report_step)))
            myprint("avg real cls loss: %.6f"%(avg_real_loss_cls/(report_step)))
            myprint("avg fake cls loss: %.6f"%(avg_fake_loss_cls/(report_step)))
            myprint()
            dloss_a, dloss_b, avg_real_loss, avg_fake_loss, avg_real_loss_cls, avg_fake_loss_cls = 0, 0, 0, 0, 0, 0

        gscheduler.step()
        dscheduler.step()
        string = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        gname = "./cache/%d/gen-%s.dict" % (ID, string)
        a_dname = "./cache/%d/a_dis-%s.dict" % (ID, string)
        b_dname = "./cache/%d/b_dis-%s.dict" % (ID, string)
        goname = "./cache/%d/genopt-%s.dict" % (ID, string)
        a_doname = "./cache/%d/a_disopt-%s.dict" % (ID, string)
        b_doname = "./cache/%d/b_disopt-%s.dict" % (ID, string)
        if gcnt % 1000 == 0 and args.log:
            generator.eval()
            result = test(generator, "dev")
            acc_transfer = result["acc"]
            self_bleu = result["self_bleu"]
            dev_acc = acc_transfer
            dev_bleu = self_bleu
            dev_ppl = result["ppl"]
            myprint(f"gcnt: {gcnt}")
            myprint("dev set:")
            myprint("acc transfer: %.6f"%acc_transfer)
            myprint("self_bleu: %.6f"%self_bleu)
            myprint("ppl: %.6f"%dev_ppl)
            result = test(generator, "test")
            acc_transfer = result["acc"]
            self_bleu = result["self_bleu"]
            ppl = result["ppl"]
            myprint("test set:")
            myprint("acc transfer: %.6f"%acc_transfer)
            myprint("self_bleu: %.6f"%self_bleu)
            myprint("ppl: %.6f"%ppl)
            if DATASET != "imdb":
                bleu = result["bleu"]
                myprint("bleu: %.6f"%bleu)
            generator.train()
            generator.cpu()
            discriminator_a.cpu()
            f_score = 2 * dev_acc * dev_bleu / (dev_acc + dev_bleu)
            if dev_ppl < best_record and dev_acc > config["acc_threshold"] and gcnt > PRETRAIN_BATCH:
                best_record = dev_ppl
                myprint("best")
                myprint("acc transfer: %.6f"%acc_transfer)
                myprint("self_bleu: %.6f"%self_bleu)
                myprint("ppl: %.6f"%ppl)
                if DATASET != "imdb":
                    myprint("bleu: %.6f"%bleu)
                myprint()
                torch.save(generator.state_dict(), best_gname)
                torch.save(discriminator_a.state_dict(), best_a_dname)
                torch.save(goptimizer.state_dict(), best_goname)
                torch.save(doptimizer_a.state_dict(), best_a_doname)
            if gcnt > PRETRAIN_BATCH:
                gname = "./cache/%d/gen-%d.dict" % (ID, gcnt)
                a_dname = "./cache/%d/a_dis-%d.dict" % (ID, gcnt)
                torch.save(generator.state_dict(), gname)
                torch.save(discriminator_a.state_dict(), a_dname)
            if args.cuda:
                generator.cuda()
                discriminator_a.cuda()
Example #17
0
def show_reported(e):
    """Print all reported election data stored on *e*.

    Output sections, in order: per-contest/per-collection vote tallies
    (e.rn_cpr), total votes cast per contest (e.rn_c), per-contest vote
    tallies (e.rn_cr), and the reported outcome per contest (e.ro_c).
    """
    utils.myprint("====== Reported election data ======")

    utils.myprint(
        "Total reported votes for each vote by cid and pbcid (e.rn_cpr):")
    for cid in e.cids:
        for pbcid in sorted(e.possible_pbcid_c[cid]):
            utils.myprint("    {}.{}: ".format(cid, pbcid), end='')
            tallies = e.rn_cpr[cid][pbcid]
            for vote in sorted(tallies):
                utils.myprint("{}:{} ".format(vote, tallies[vote]), end='')
            utils.myprint()

    utils.myprint("Total votes cast for each cid (e.rn_c):")
    for cid in e.cids:
        utils.myprint("    {}: {}".format(cid, e.rn_c[cid]))

    utils.myprint("Total cast for each vote for each cid (e.rn_cr):")
    for cid in e.cids:
        utils.myprint("    {}: ".format(cid), end='')
        contest_tallies = e.rn_cr[cid]
        for vote in sorted(contest_tallies):
            utils.myprint("{}:{} ".format(vote, contest_tallies[vote]),
                          end='')
        utils.myprint()

    utils.myprint("Reported outcome for each cid (e.ro_c):")
    for cid in e.cids:
        utils.myprint("    {}:{}".format(cid, e.ro_c[cid]))
Example #18
0
def show_audit_summary(e):
    """Print a closing summary of the audit.

    Reports the set of final measurement statuses, whether any `Active'
    measurement is still `Open' or signals `Upset', whether the maximum
    stage time was reached, and per-collection plus total sample counts.
    """
    utils.myprint("=============")
    utils.myprint("Audit completed!")

    utils.myprint("All measurements have a status in the following list:",
                  e.election_status_t[e.stage_time])
    statuses = e.status_tm[e.stage_time]
    # True when no measurement is simultaneously `Active' and `Open'.
    if not any(e.sampling_mode_m[mid] == "Active"
               and statuses[mid] == "Open"
               for mid in e.mids):
        utils.myprint("No `Active' measurement still has `Open' status.")
    if any(e.sampling_mode_m[mid] == "Active"
           and statuses[mid] == "Upset"
           for mid in e.mids):
        utils.myprint(("At least one `Active' measurement signals"
                       " `Upset' (full recount needed)."))
    if e.stage_time > e.max_stage_time:
        utils.myprint("Maximum audit stage time ({}) reached.".format(
            e.max_stage_time))

    utils.myprint("Number of ballots sampled, by paper ballot collection:")
    sample_counts = e.sn_tp[e.stage_time]
    for pbcid in e.pbcids:
        utils.myprint("  {}:{}".format(pbcid, sample_counts[pbcid]))
    utils.myprint_switches = ["std"]
    utils.myprint("Total number of ballots sampled: ", end='')
    utils.myprint(sum(sample_counts[pbcid] for pbcid in e.pbcids))
Example #19
0
def main(args):
    """Train a GPT-2 style-transfer generator adversarially.

    Alternates generator updates (reconstruction MLE + style-classifier
    adversarial loss + cycle-consistency loss) with discriminator updates,
    evaluates on dev/test every 1000 generator batches, and checkpoints
    the model that maximizes the dev acc/self-BLEU F-score.

    Relies on module-level globals: ID, config, DATASET, STYLE_TYPE, LOG,
    and helpers start_log, myprint, generate, test, Dataloader, classifier.
    """
    # start experiment
    report_step = 100
    # Seed everything (and force deterministic cuDNN) for reproducibility;
    # the task ID doubles as the seed unless one is given explicitly.
    manualSeed = ID if args.seed == 0 else args.seed
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(manualSeed)
    torch.manual_seed(manualSeed)
    np.random.seed(manualSeed)
    if args.cuda:
        torch.cuda.manual_seed_all(manualSeed)
    string = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    start_log("./log/%d-" % ID + string + LOG, args.log)
    myprint(args)
    for d in config:
        myprint("%s: %s" % (d, str(config[d])))
    args.batch_size = config["BATCH_SIZE"]

    # load data
    # Tokenizer gets four extra special tokens; the generator's embedding
    # matrix is resized to match below.
    tokenizer = GPT2Tokenizer.from_pretrained("./%s/gpt" % DATASET)
    tokenizer.bos_token = '<BOS>'
    tokenizer.pad_token = "<PAD>"
    print(tokenizer.add_tokens(['<negative>']))
    print(tokenizer.add_tokens(['<positive>']))
    print(tokenizer.add_tokens(['<PAD>']))
    print(tokenizer.add_tokens(['<BOS>']))

    with open("./%s/%s-gpt.train.json" % (DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    dataloader = Dataloader.GPTLoader(data,
                                      tokenizer,
                                      args.batch_size,
                                      args.cuda,
                                      shuffle=True,
                                      input_maxlen=40)

    with open("./%s/%s-gpt.dev.json" % (DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    dev_data = Dataloader.GPTLoader(data,
                                    tokenizer,
                                    args.batch_size,
                                    args.cuda,
                                    shuffle=False)

    with open("./%s/%s-gpt.test.json" % (DATASET, STYLE_TYPE), "r") as f:
        data = json.load(f)
    # Only non-imdb datasets ship human references, so they use the
    # reference-aware loader (enables reference BLEU in test()).
    if DATASET != "imdb":
        test_data = Dataloader.GPTRefLoader(data, tokenizer, args.batch_size,
                                            args.cuda)
    else:
        test_data = Dataloader.GPTLoader(data, tokenizer, args.batch_size,
                                         args.cuda)

    # build model
    generator = GPT2LMHeadModel.from_pretrained("./%s/gpt" % DATASET)
    generator.resize_token_embeddings(len(tokenizer))
    if config["g_dir"] is not None:
        generator.load_state_dict(torch.load(config["g_dir"]))
    discriminator_a = classifier.AdvDisNet(word_num=len(tokenizer))
    if config["a_dir"] is not None:
        discriminator_a.load_state_dict(torch.load(config["a_dir"]))

    # Cross-entropy over vocabulary; used only for the cycle loss.
    L = nn.CrossEntropyLoss()

    if args.cuda:
        generator = generator.cuda()
        discriminator_a = discriminator_a.cuda()
        L = L.cuda()

    # Optimizers; optionally warm-started from saved optimizer states,
    # after which the learning rate is reset to the configured value.
    goptimizer = optim.Adam(generator.parameters(), lr=config["generator lr"])
    if config["goptim_dir"] is not None:
        goptimizer.load_state_dict(
            torch.load(config["goptim_dir"],
                       map_location=torch.device('cuda', args.gpuid)))
    for param_group in goptimizer.param_groups:
        param_group['lr'] = config["generator lr"]
    doptimizer_a = optim.Adam(discriminator_a.parameters(),
                              lr=config["discriminator lr"])
    if config["aoptim_dir"] is not None:
        doptimizer_a.load_state_dict(
            torch.load(config["aoptim_dir"],
                       map_location=torch.device('cuda', args.gpuid)))
    for param_group in doptimizer_a.param_groups:
        param_group['lr'] = config["discriminator lr"]

    # Training hyperparameters pulled out of config for brevity.
    EPOCH = config["EPOCH"]
    GBATCH = config["generator batch"]
    DBATCH = config["discriminator batch"]
    W_M = config["mle weight"]
    W_A = config["adv weight"]
    W_C = config["cycle weight"]
    GRAD_CLIP = config["grad clip"]
    # During the first PRETRAIN_BATCH generator updates only the MLE loss
    # is used and discriminator training is skipped.
    PRETRAIN_BATCH = 1000
    accumulation_step = config["accumulation_step"]

    # Running sums for periodic reporting; reset every report_step batches.
    gloss_all, gloss_mle, gloss_adv, gloss_cycle, dloss_a, gcnt, dcnt, avg_adv_score = 0, 0, 0, 0, 0, 0, 0, 0
    avg_fake_loss, avg_real_loss = 0, 0
    best_record = 0
    if args.log:
        os.mkdir("./cache/%d" % (ID))
        os.mkdir("./cache/%d/best/" % (ID))
    # Checkpoint paths. NOTE(review): best_b_dname/best_b_doname are never
    # written in this function (there is no discriminator_b here).
    best_gname = "./cache/%d/best/gen.dict" % ID
    best_a_dname = "./cache/%d/best/a_dis.dict" % ID
    best_b_dname = "./cache/%d/best/b_dis.dict" % ID
    best_goname = "./cache/%d/best/genopt.dict" % ID
    best_a_doname = "./cache/%d/best/a_disopt.dict" % ID
    best_b_doname = "./cache/%d/best/b_disopt.dict" % ID

    # LR decay: halve generator LR every 500 epochs, discriminator every
    # 250 (schedulers are stepped once per epoch below).
    gscheduler = optim.lr_scheduler.StepLR(goptimizer,
                                           step_size=500,
                                           gamma=0.5)
    dscheduler = optim.lr_scheduler.StepLR(doptimizer_a,
                                           step_size=250,
                                           gamma=0.5)
    for i in range(EPOCH):
        # generator training
        generator.train()
        discriminator_a.eval()
        step_cnt = 0
        goptimizer.zero_grad()
        # Each optimizer step aggregates accumulation_step micro-batches.
        for j in range(GBATCH * accumulation_step):
            step_cnt += 1
            print(gcnt)
            batch = dataloader.get()
            # reconstruction loss
            # Input is [src, style-token, src]; GPT-2's LM loss over this
            # sequence acts as the reconstruction (MLE) objective.
            rec_text = torch.cat(
                (batch["src_text"], batch["style_tokens"].unsqueeze(1),
                 batch["src_text"]),
                dim=1)
            outputs = generator(rec_text, labels=rec_text)
            mleloss = outputs[0]
            # Thresholding zeroes the loss once it is low enough, so easy
            # reconstruction stops dominating the gradient.
            mleloss_ = F.threshold(mleloss, config["mle_threshold"], 0)
            # style classifier loss
            # Generate a transfer continuation from [src, transfer-token].
            transfer_text = torch.cat(
                (batch["src_text"], batch["transfer_tokens"].unsqueeze(1)),
                dim=1)
            cur_len = transfer_text.size(1)
            _, probs = generate(generator,
                                transfer_text,
                                cur_len=cur_len,
                                max_length=int(cur_len * 2 - 1),
                                pad_token_id=tokenizer.pad_token_id,
                                eos_token_ids=tokenizer.eos_token_id,
                                batch_size=args.batch_size)
            probs = F.softmax(probs, dim=2)
            idx_probs, words = torch.max(probs, dim=2)
            # Soft (differentiable) discriminator pass on the target style;
            # +0.0001 guards log(0).
            style_pred = discriminator_a.approximate(probs, 1 - batch["style"])
            style_pred = torch.squeeze(style_pred, 1)
            advloss = -torch.log(style_pred + 0.0001).mean()

            # cycle loss
            if args.nocycle:
                # Placeholder zero tensor so .item() below still works.
                cycleloss = torch.zeros(1)
                cycleloss_ = 0
            else:
                # Transfer the transferred distribution back toward the
                # source style and score it against the original text.
                cur_len = batch["src_text"].size(1) + 1
                _, probs = generate(generator,
                                    probs,
                                    cur_len=cur_len,
                                    max_length=int(cur_len * 2 - 1),
                                    pad_token_id=tokenizer.pad_token_id,
                                    eos_token_ids=tokenizer.eos_token_id,
                                    batch_size=args.batch_size,
                                    approximate=True,
                                    style_token=batch["style_tokens"])
                probs = probs.transpose(1, 2)
                cycleloss = L(probs, batch["src_text"])
                cycleloss_ = F.threshold(cycleloss, config["cycle_threshold"],
                                         0)

            # During pretraining only the MLE term trains the generator.
            if gcnt < PRETRAIN_BATCH:
                loss = W_M * mleloss_
            else:
                loss = W_M * mleloss_ + W_A * advloss + W_C * cycleloss_
            gloss_all += loss.item() / accumulation_step
            gloss_mle += mleloss.item()
            gloss_adv += advloss.item()
            gloss_cycle += cycleloss.item()
            loss = loss / accumulation_step  # normalizing
            loss.backward()
            # Apply the accumulated gradients once per accumulation window.
            if step_cnt % accumulation_step == 0:
                gcnt += 1
                step_cnt = 0
                nn.utils.clip_grad_norm_(generator.parameters(), GRAD_CLIP)
                goptimizer.step()
                goptimizer.zero_grad()
                del advloss, cycleloss, mleloss, mleloss_, loss, cycleloss_
                torch.cuda.empty_cache()
        # discriminator training
        discriminator_a.train()
        generator.eval()
        doptimizer_a.zero_grad()
        for j in range(DBATCH):
            # Discriminator waits until the generator pretraining is done.
            if gcnt < PRETRAIN_BATCH:
                break
            batch = dataloader.get()
            transfer_text = torch.cat(
                (batch["src_text"], batch["transfer_tokens"].unsqueeze(1)),
                dim=1)
            cur_len = transfer_text.size(1)
            # Generated samples are treated as constants for the
            # discriminator update (no grad through the generator).
            with torch.no_grad():
                _, probs = generate(generator,
                                    transfer_text,
                                    cur_len=cur_len,
                                    max_length=int(cur_len * 2 - 1),
                                    pad_token_id=tokenizer.pad_token_id,
                                    eos_token_ids=tokenizer.eos_token_id,
                                    batch_size=args.batch_size)
                probs = F.softmax(probs, dim=2)
                probs.detach_()
            style_pred = discriminator_a.approximate(probs, 1 - batch["style"])
            style_pred = torch.squeeze(style_pred, 1)
            real_style_pred_true = discriminator_a(batch["src_text"],
                                                   batch["style"])
            real_style_pred_false = discriminator_a(batch["src_text"],
                                                    1 - batch["style"])
            # NOTE(review): "ture" is a typo — the squeezed tensor is bound
            # to a dead name and the un-squeezed real_style_pred_true is
            # used below. Since .mean() reduces over all elements this
            # looks numerically harmless, but the squeeze is a no-op here;
            # confirm and fix the name.
            real_style_pred_ture = torch.squeeze(real_style_pred_true, 1)
            real_style_pred_false = torch.squeeze(real_style_pred_false, 1)
            # Fake samples pushed toward 0; real (text, style) pairs toward
            # 1 and mismatched pairs toward 0.
            fake_loss_a = -torch.log(1 - style_pred).mean()
            real_loss_a = (-torch.log(real_style_pred_true).mean() -
                           torch.log(1 - real_style_pred_false).mean()) / 2
            advloss_a = real_loss_a + fake_loss_a
            avg_fake_loss += fake_loss_a.item()
            avg_real_loss += real_loss_a.item()
            now_fake_loss = fake_loss_a.item()
            now_real_loss = real_loss_a.item()
            now_dis_loss = advloss_a.item()
            dloss_a += advloss_a.item()
            doptimizer_a.zero_grad()
            advloss_a.backward()
            nn.utils.clip_grad_norm_(discriminator_a.parameters(), GRAD_CLIP)
            doptimizer_a.step()
            dcnt += 1
            del real_loss_a, fake_loss_a, advloss_a
            torch.cuda.empty_cache()

        # Periodic generator-side report; accumulators reset afterwards.
        if gcnt % report_step == 0 and gcnt != 0:
            myprint("task id: %d" % ID)
            myprint("generator training batch: %d" % gcnt)
            myprint("average loss: %.6f" % (gloss_all / report_step))
            myprint("average adv loss: %.6f" %
                    (gloss_adv / (report_step * accumulation_step)))
            myprint("average mle loss: %.6f" %
                    (gloss_mle / (report_step * accumulation_step)))
            myprint("average cycle loss: %.6f" %
                    (gloss_cycle / (report_step * accumulation_step)))
            myprint()
            gloss_all, gloss_mle, gloss_adv, gloss_cycle = 0, 0, 0, 0

        # Periodic discriminator-side report.
        if dcnt % report_step == 0 and dcnt != 0:
            myprint("discriminator training batch: %d" % dcnt)
            myprint("a average loss: %.6f" % (dloss_a / (report_step)))
            myprint("avg real loss: %.6f" % (avg_real_loss / (report_step)))
            myprint("avg fake loss: %.6f" % (avg_fake_loss / (report_step)))
            myprint()
            dloss_a, avg_real_loss, avg_fake_loss = 0, 0, 0

        gscheduler.step()
        dscheduler.step()
        # Evaluate and checkpoint every 1000 generator batches (when
        # logging is enabled).
        if gcnt % 1000 == 0 and args.log:
            generator.eval()
            result = test(generator, "dev")
            acc_transfer = result["acc"]
            self_bleu = result["self_bleu"]
            dev_acc = acc_transfer
            dev_bleu = self_bleu
            myprint(f"gcnt: {gcnt}")
            myprint("dev set:")
            myprint("acc transfer: %.6f" % acc_transfer)
            myprint("self_bleu: %.6f" % self_bleu)
            result = test(generator, "test")
            acc_transfer = result["acc"]
            self_bleu = result["self_bleu"]
            if DATASET != "imdb":
                bleu = result["bleu"]
            myprint("test set:")
            myprint("acc transfer: %.6f" % acc_transfer)
            myprint("self_bleu: %.6f" % self_bleu)
            if DATASET != "imdb":
                myprint("bleu: %.6f" % bleu)
            generator.train()
            # Move models to CPU before torch.save so checkpoints are
            # device-independent; moved back to GPU below.
            generator.cpu()
            discriminator_a.cpu()
            # Model selection: harmonic mean of dev transfer accuracy and
            # dev self-BLEU.
            f_score = 2 * dev_acc * dev_bleu / (dev_acc + dev_bleu)
            if f_score > best_record and gcnt > PRETRAIN_BATCH:
                best_record = f_score
                myprint("best")
                myprint("acc transfer: %.6f" % acc_transfer)
                myprint("self_bleu: %.6f" % self_bleu)
                if DATASET != "imdb":
                    myprint("bleu: %.6f" % bleu)
                myprint()
                torch.save(generator.state_dict(), best_gname)
                torch.save(discriminator_a.state_dict(), best_a_dname)
                torch.save(goptimizer.state_dict(), best_goname)
                torch.save(doptimizer_a.state_dict(), best_a_doname)
            # Also keep a per-gcnt snapshot after pretraining.
            if gcnt > PRETRAIN_BATCH:
                gname = "./cache/%d/gen-%d.dict" % (ID, gcnt)
                a_dname = "./cache/%d/a_dis-%d.dict" % (ID, gcnt)
                torch.save(generator.state_dict(), gname)
                torch.save(discriminator_a.state_dict(), a_dname)
            if args.cuda:
                generator.cuda()
                discriminator_a.cuda()
Example #20
0
    def mystats(self, filename=None, cond=None, ids=None, dataset=None):
        """Print a statistics report for this object via myprint.

        Args:
            filename: optional destination forwarded to every myprint call.
            cond: optional predicate on self; when given and falsy for
                self, nothing is printed.
            ids, dataset: when both are provided, per-example error
                details are also emitted via utils.do_error.
        """
        # Guard clause: bail out when a condition was supplied and fails.
        # (Equivalent to the printing condition `cond is None or cond(self)`.)
        if cond is not None and not cond(self):
            return

        myprint(
            '-------------Statistics-------------------------------------\n',
            filename)
        if '_time' in self.metrics:
            myprint(
                'Time lapsed: {} secs.\n'.format(self.metrics['_time']),
                filename)
        myprint(self.header, filename)

        # Confusion-matrix-derived scalar metrics.
        myprint(
            'Accuracy={d[0]:3.6f} Sensitivity={d[1]:3.3f} Specificity={d[2]:3.3f} f1_score={d[3]:3.3f} my_score={d[4]:3.6f}  \n'
            .format(d=self.conf_based_stats()), filename)
        # Raw and row-normalized confusion matrices, flattened to
        # (tn, fp, fn, tp) order.
        myprint(
            'Confusion matrix: tn={d[0]:02.4f}, fp={d[1]:02.4f}, fn={d[2]:02.4f}, tp={d[3]:02.4f} \n'
            .format(d=self.cnf_matrix.ravel()), filename)
        myprint(
            'Normalized Confusion matrix: tn={d[0]:02.4f}, fp={d[1]:02.4f}, fn={d[2]:02.4f}, tp={d[3]:02.4f} \n'
            .format(d=self.cnf_matrix_norm.ravel()), filename)

        # Optional per-example error breakdown.
        if ids is not None and dataset is not None:
            utils.do_error(dataset, ids, self.true, self.predictions,
                           filename)
        myprint(
            '------------------------------------------------------------\n',
            filename)