def run(args):
    with open(expand(args.airlines_raw_file), 'r') as f:
        dict_reader = csv.DictReader(f)
        raw_rows = list(dict_reader)
    raw_row_by_tweet_id = {
        int(row['first_tweet_id']): row
        for row in raw_rows
    }

    f_in = open(expand(args.airlines_merged_file), 'r')
    f_out = open(expand(args.out_file), 'w')
    dict_reader = csv.DictReader(f_in)
    dict_writer = csv.DictWriter(
        f_out,
        fieldnames=['first_tweet_id', 'tag', 'first_utterance', 'context'])
    dict_writer.writeheader()
    for old_merged_row in dict_reader:
        if old_merged_row['first_tweet_id'] == '':
            continue
        tweet_id = int(old_merged_row['first_tweet_id'])
        raw_row = raw_row_by_tweet_id[tweet_id]
        raw_row['tag'] = old_merged_row['tag']
        dict_writer.writerow(raw_row)
    # so, we accidentally had a blank line at the end of the non-raw dataset. add that in here...
    dict_writer.writerow({
        'first_tweet_id': '',
        'tag': 'UNK',
        'first_utterance': '',
        'context': ''
    })
    f_in.close()
    f_out.close()
Example #2
File: swarm.py Project: bholt/ipa
def add_keys(args=None, opt=None):
    if 'cass' in opt.containers:
        cons = containers('owl_cass')
    elif 'all' in opt.containers:
        cons = containers('owl_')
    else:
        cons = opt.containers

    for c in cons:
        puts(colored.magenta("[{}] ".format(c), bold=True) + "add keys")
        ex = swarm_exec(c)
        ex.sh(c='mkdir -p ~/.ssh')
        ex.sh(c='cat > ~/.ssh/id_rsa.pub', _in=open(expand("~/.ssh/id_rsa.pub")))
        ex.sh(c='cat > ~/.ssh/id_rsa; chmod go-r ~/.ssh/id_rsa', _in=open(expand("~/.ssh/id_rsa")))
        ex.sh(c='cat >> ~/.ssh/config', _in=open(expand("~/.ssh/bistromath.config")))
        ex.sh(c='cat >> ~/.bashrc', _in="up(){ pushd /src >/dev/null; rsync -a bistromath:~/sync/owl . --exclude=target/ --exclude=.idea/; popd >/dev/null; };\n")
Example #3
def process_movies(dir):
    set_key(API_KEY) #store this key somewhere
    set_locale()

    dir = expand(dir)

    movies = list()
    moviesToDisplay = loadfromfile(dir)

    if not moviesToDisplay:
        moviesToDisplay = list()

    titles = list()

    for m in moviesToDisplay:
        titles.append(m.title)

    explore(dir)

    for f in fileset:
        if f.fulltitle in titles:
            continue

        result = searchMovie(f.fulltitle)

        if len(result) == 0:
            if f.filename not in notfound:
                notfound.append(f.filename)
            print "Couldn't find results for: " + f.fulltitle + " result = " + str(result)
            continue

        # print "**** Found results for: " + f.fulltitle + " result = " + str(result[0])

        movie = result[0] if len(result) > 0 else None
        movies.append(movie)

    for m in movies:
        if not m:
            continue

        if m.id in movieids:
            continue

        p = m.poster

        if p:
            d = DisplayMovie(m,  p.geturl('w154'),)
        else:
            d = DisplayMovie(m, '',)

        pos = bisect.bisect(movieids, m.id)
        movieids.insert(pos, m.id)

        moviesToDisplay.append(d)

    writetofile(dir, moviesToDisplay)
    return sorted(moviesToDisplay, key=lambda x: x.movie.title)
Example #4
def create_labeled(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    out_f = open(expand(args.out_labeled), 'w')  # open now to check we can

    clusters, post_ids, post2cluster = get_clusters(
        in_dupes_file=args.in_dupes, in_max_dupes=args.in_max_dupes, max_clusters=args.max_clusters)
    print('cluster sizes:', [len(cluster) for cluster in clusters])
    print('len(clusters)', len(clusters))
    print('len(post_ids) from dupes graph', len(post_ids))

    print("removing post ids which don't have answers...")
    post2answer, answer2post = read_posts_index(in_posts_index=args.in_posts_index)
    post_ids = [id for id in post_ids if id in post2answer]
    print('len(post_ids) after removing no answer', len(post_ids))
    new_clusters = []
    for cluster in clusters:
        cluster = [id for id in cluster if id in post2answer]
        new_clusters.append(cluster)
    clusters = new_clusters
    print('len clusters after removing no answer', [len(cluster) for cluster in clusters])

    post_ids_set = set(post_ids)
    print('len(post_ids_set)', len(post_ids_set))
    answer_ids = [post2answer[id] for id in post_ids]
    print('len(answer_ids)', len(answer_ids))

    preprocessor = Preprocessor(max_len=args.max_len) if not args.no_preprocess else NullPreprocessor()
    post_by_id = load_posts(answer2post=answer2post, post2answer=post2answer, in_posts=args.in_posts)

    count_by_state = defaultdict(int)
    n = 0
    dict_writer = csv.DictWriter(out_f, fieldnames=[
        'id', 'cluster_id', 'question_title', 'question_body', 'answer_body'])
    dict_writer.writeheader()
    for post_id in post_ids:
        if post_id not in post_by_id:
            count_by_state['not in post_by_id'] += 1
            continue
        post = post_by_id[post_id]
        if 'answer_body' not in post or 'question_body' not in post:
            count_by_state['no body, or no answer'] += 1
            continue
        count_by_state['ok'] += 1
        cluster_id = post2cluster[post_id]
        row = {
            'id': post_id,
            'cluster_id': cluster_id,
            'question_title': preprocessor(post['question_title'])[1],
            'question_body': preprocessor(post['question_body'])[1],
            'answer_body': preprocessor(post['answer_body'])[1]
        }
        dict_writer.writerow(row)
        n += 1
    print(count_by_state)
    print('rows written', n)
Example #5
def read_posts_index(in_posts_index):
    with open(expand(in_posts_index), 'r') as f:
        dict_reader = csv.DictReader(f)
        index_rows = list(dict_reader)
    index_rows = [{'question_id': int(row['question_id']), 'answer_id': int(row['answer_id'])} for row in index_rows]
    post2answer = {row['question_id']: row['answer_id'] for row in index_rows}
    answer2post = {row['answer_id']: row['question_id'] for row in index_rows}
    print('loaded index')
    return post2answer, answer2post
Example #6
def run_aggregated(in_csv_file, in_max_rows, out_examples, target_companies,
                   no_preprocess):
    with open(expand(out_examples), 'w') as f_examples:
        examples_writer = csv.DictWriter(
            f_examples,
            fieldnames=['first_tweet_id', 'first_utterance', 'context'])
        examples_writer.writeheader()
        for company in target_companies:
            print(company)
            run_for_company(in_csv_file, in_max_rows, examples_writer, company,
                            no_preprocess)
Example #7
def get_clusters(in_dupes_file, in_max_dupes, max_clusters):
    """
    we're reading in the list of pairs of dupes. These are a in format of two post ids per line, like:
        615465 8653
        833376 377050
        30585 120621
        178532 152184
        69455 68850

    When we read these in, we have no idea whether these posts have answers etc. We just read in all
    the pairs of post ids. We are then going to add these post ids to a graph (each post id forms a node),
    and the pairs of post ids become connectsion in the graph. We then form all connected components.

    We sort the connected components by size (reverse order), and take the top num_test_clusters components
    We just ignore the other components
    """
    f_in = open(expand(in_dupes_file), 'r')
    csv_reader = csv.reader(f_in, delimiter=' ')
    G = nx.Graph()
    for n, row in enumerate(csv_reader):
        left = int(row[0])
        right = int(row[1])
        G.add_edge(left, right)
        if in_max_dupes is not None and n >= in_max_dupes:
            print('reached max dupes rows => break')
            break
    f_in.close()
    print('num nodes', len(G))
    print('num clusters', len(list(nx.connected_components(G))))
    count_by_size = defaultdict(int)
    clusters_by_size = defaultdict(list)
    for i, cluster in enumerate(nx.connected_components(G)):
        size = len(cluster)
        count_by_size[size] += 1
        clusters_by_size[size].append(cluster)
    print('count by size:')
    clusters = []
    top_clusters = []
    for size, count in sorted(count_by_size.items(), reverse=True):
        for cluster in clusters_by_size[size]:
            clusters.append(cluster)
            if len(top_clusters) < max_clusters:
                top_clusters.append(cluster)

    print('len(clusters)', len(clusters))
    top_cluster_post_ids = [id for cluster in top_clusters for id in cluster]

    post2cluster = {}
    for cluster_id, cluster in enumerate(clusters):
        for post in cluster:
            post2cluster[post] = cluster_id

    return top_clusters, top_cluster_post_ids, post2cluster
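
A minimal, self-contained sketch of the grouping get_clusters performs (duplicate pairs -> graph -> connected components sorted by size). The sample pairs are the ones from the docstring; keep_top is a hypothetical stand-in for max_clusters.

# Sketch only: group dupe pairs into connected components with networkx.
import networkx as nx

pairs = [(615465, 8653), (833376, 377050), (30585, 120621), (178532, 152184), (69455, 68850)]
keep_top = 3  # stand-in for max_clusters

G = nx.Graph()
for left, right in pairs:
    G.add_edge(left, right)  # each post id becomes a node, each pair an edge

components = sorted(nx.connected_components(G), key=len, reverse=True)  # largest first
top_clusters = components[:keep_top]
post2cluster = {post: i for i, cluster in enumerate(components) for post in cluster}

print('num clusters:', len(components))
print('top cluster sizes:', [len(c) for c in top_clusters])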
Example #8
def mkdir(path):
    """Creates a directory *path*, after performing file path expansion. Does
  nothing if *path* already exists.
  
  Exits with an error message if it cannot create the directory
  """
    path = expand(path)
    if isdir(path):
        return
    try:
        make_directory(path)
    except OSError as e:
        exit(str(e))
Example #9
def index_posts(args):
    """
    go through Posts.xml and find, for each question post id, the id of its accepted answer; store both in a csv file
    (a short attribute-parsing sketch follows this function)

    example row:
        <row Id="5" PostTypeId="1" CreationDate="2010-07-28T19:23:40.273" Score="22" ViewCount="581"
        Body="&lt;p&gt;What are some alternatives to upgrading without using the standard upgrade system?
        Suppose for example that I wanted to upgrade an Ubuntu installation on a machine with a poor Internet connection. 
        What would my options be? Could I just use a standard Ubuntu disk to upgrade this machine? If I already have a standard Ubuntu
         disk and want to use that, could I do a clean install without wiping data?&lt;/p&gt;&#xA;"
         OwnerUserId="10" LastEditorUserId="10581" LastEditDate="2014-02-18T13:34:25.793" LastActivityDate="2014-02-18T13:34:25.793"
         Title="What are some alternatives to upgrading without using the standard upgrade system?"
         Tags="&lt;upgrade&gt;&lt;live-cd&gt;&lt;system-installation&gt;"
         AnswerCount="2" CommentCount="1" FavoriteCount="1" />
    """
    in_f = open(expand(args.in_posts), 'r')
    out_f = open(expand(args.out_posts_index), 'w')
    dict_writer = csv.DictWriter(out_f, fieldnames=['question_id', 'answer_id'])
    dict_writer.writeheader()
    last_print = time.time()
    for n, row in enumerate(in_f):
        row = row.strip()
        if not row.startswith('<row'):
            continue
        dom = minidom.parseString(row)
        node = dom.firstChild
        att = node.attributes
        if 'AcceptedAnswerId' in att:
            id = att['Id'].value
            accepted_answer_id = att['AcceptedAnswerId'].value
            dict_writer.writerow({'question_id': id, 'answer_id': accepted_answer_id})
        if args.in_max_posts is not None and n >= args.in_max_posts:
            print('reached max rows => breaking')
            break
        if time.time() - last_print >= 3.0:
            print(n)
            last_print = time.time()
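
A minimal sketch of the attribute parsing used above, using only the standard-library minidom; the row below is shortened from the docstring example, with a made-up AcceptedAnswerId added for illustration.

# Sketch only: parse one Posts.xml <row .../> line and read its attributes.
from xml.dom import minidom

row = '<row Id="5" PostTypeId="1" AcceptedAnswerId="17" Title="What are some alternatives to upgrading?" />'
node = minidom.parseString(row).documentElement

if node.hasAttribute('AcceptedAnswerId'):
    question_id = node.getAttribute('Id')
    answer_id = node.getAttribute('AcceptedAnswerId')
    print(question_id, answer_id)  # -> 5 17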
Example #10
def load_posts(answer2post, post2answer, in_posts):
    # load in all the posts from Posts.xml
    post_by_id = defaultdict(dict)
    posts_f = open(expand(in_posts), 'r')
    last_print = time.time()
    for n, row in enumerate(posts_f):
        row = row.strip()
        if not row.startswith('<row'):
            continue
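        # cheaply pull out the Id="..." attribute with str.partition before deciding whether to fully parse the row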
        row_id = row.partition(' Id="')[2].partition('"')[0]
        if row_id == '':
            print(row)
            assert row_id != ''
        row_id = int(row_id)
        if row_id in post2answer:
            """
            example row:
            <row Id="5" PostTypeId="1" CreationDate="2010-07-28T19:23:40.273" Score="22" ViewCount="581"
            Body="&lt;p&gt;What are some alternatives to upgrading without using the standard upgrade system?
            Suppose for example that I wanted to upgrade an Ubuntu installation on a machine with a poor Internet connection. 
            What would my options be? Could I just use a standard Ubuntu disk to upgrade this machine? If I already have a standard Ubuntu
             disk and want to use that, could I do a clean install without wiping data?&lt;/p&gt;&#xA;"
             OwnerUserId="10" LastEditorUserId="10581" LastEditDate="2014-02-18T13:34:25.793" LastActivityDate="2014-02-18T13:34:25.793"
             Title="What are some alternatives to upgrading without using the standard upgrade system?"
             Tags="&lt;upgrade&gt;&lt;live-cd&gt;&lt;system-installation&gt;"
             AnswerCount="2" CommentCount="1" FavoriteCount="1" />
            """
            dom = minidom.parseString(row)
            node = dom.firstChild
            att = node.attributes
            assert att['PostTypeId'].value == '1'
            post_by_id[row_id]['question_title'] = att['Title'].value
            post_by_id[row_id]['question_body'] = att['Body'].value
        elif row_id in answer2post:
            dom = minidom.parseString(row)
            node = dom.firstChild
            att = node.attributes
            assert att['PostTypeId'].value == '2'
            post_id = answer2post[row_id]
            post_by_id[post_id]['answer_body'] = att['Body'].value
        if time.time() - last_print >= 3.0:
            print(len(post_by_id))
            last_print = time.time()
        if args.in_max_posts is not None and n > args.in_max_posts:
            print('reached in_max_posts => terminating')
            break
    print('loaded info from Posts.xml')
    return post_by_id
Example #11
def create_unlabeled(args):
    """
    this is going to:
    - take a question (a specific post type in Posts.xml)
    - match it with the accepted answer (using the index)
    - write these out (a tiny pairing sketch follows this function)
    """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    out_f = open(expand(args.out_unlabeled), 'w')  # open now, to check we can...

    post2answer, answer2post = read_posts_index(in_posts_index=args.in_posts_index)
    print('loaded index')
    print('posts in index', len(post2answer))

    preprocessor = Preprocessor(max_len=args.max_len) if not args.no_preprocess else NullPreprocessor()
    post_by_id = load_posts(post2answer=post2answer, answer2post=answer2post, in_posts=args.in_posts)
    print('loaded all posts, len(post_by_id)', len(post_by_id))

    count_by_state = defaultdict(int)
    n = 0
    dict_writer = csv.DictWriter(out_f, fieldnames=['id', 'question_title', 'question_body', 'answer_body'])
    dict_writer.writeheader()
    last_print = time.time()
    for post_id, info in post_by_id.items():
        if 'answer_body' not in info or 'question_body' not in info:
            count_by_state['no body, or no answer'] += 1
            continue
        count_by_state['ok'] += 1
        dict_writer.writerow({
            'id': post_id,
            'question_title': preprocessor(info['question_title'])[1],
            'question_body': preprocessor(info['question_body'])[1],
            'answer_body': preprocessor(info['answer_body'])[1]
        })
        if time.time() - last_print >= 10:
            print('written', n)
            last_print = time.time()
        n += 1

    print(count_by_state)
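
A tiny sketch of the question-to-accepted-answer pairing the docstring describes, with made-up ids and bodies standing in for what load_posts collects from Posts.xml.

# Sketch only: pair each question with its accepted answer, skipping incomplete posts.
post_by_id = {
    5: {'question_title': 'How to upgrade offline?',
        'question_body': 'Suppose the machine has a poor Internet connection...',
        'answer_body': 'You can use the alternate install CD...'},
    6: {'question_title': 'No accepted answer here', 'question_body': '...'},
}

for post_id, info in post_by_id.items():
    if 'answer_body' not in info or 'question_body' not in info:
        continue  # mirrors the "no body, or no answer" state above
    print(post_id, info['question_title'], '->', info['answer_body'])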
Example #12
def symlink(src, dst):
    """Given a *src* and a *dst*, creates a symlink fro *src* to *dst*.
  If the given *dst* already exists as a symlink (and is not a directory),
  it will attempt to remove the old symlink and create a new one.

  Exits with an error message if any symlink fails
  """
    link = expand(join("~", dst))
    item = join(getcwd(), src)
    print("{} -> {}".format(item, link))
    if not isdir(item) and islink(link):
        try:
            remove(link)
        except OSError as e:
            exit("Could not remove old symlink {}: {}".format(link, e))
    linker = partial(symbolic_link, item, link)
    if windows:
        linker = partial(linker, target_is_directory=isdir(item))
    try:
        linker()
    except OSError as e:
        exit("Could not symlink {}: {}".format(item, e))
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path',
                        type=str,
                        default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path',
                        type=str,
                        default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model',
                        type=str,
                        choices=['ae', 'qt'],
                        default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path',
                        type=str,
                        help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)

    parser.add_argument('--save-model-path', type=str)

    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='label')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(args.data_path,
                      view1_col=args.view1_col,
                      view2_col=args.view2_col,
                      label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = multiview_encoders.load_model(
            args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.from_embeddings(
                args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    pre_acc, pre_state, pre_state_epoch = 0., None, None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset,
                                      perm_idx,
                                      expressions,
                                      train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset,
                                     dataset.tst_idx,
                                     expressions,
                                     train=False)
        if tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
            pre_state_epoch = epoch
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(
            datetime.datetime.now(), epoch, trn_loss, tst_acc))

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print(f'loaded best state from epoch {pre_state_epoch}')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialize optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster,
                                    max_iter=300,
                                    verbose=0,
                                    random_state=0)
    z_v1, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    preds_v1 = kmeans.fit_predict(z_v1)

    lgolds, lpreds = [], []
    for g, p in zip(golds, list(preds_v1)):
        if g > 0:
            lgolds.append(g)
            lpreds.append(p)
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))

    print(
        f'{datetime.datetime.now()} pretrain: test prec={prec:.4f} rec={rec:.4f} '
        f'f1={f1:.4f} acc={acc:.4f}')

    shot, way, query = 5, args.way, 15

    preds_v2 = None
    best_epoch, best_model, best_dev_f1 = None, None, None
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        _loss, preds_v2, tst_preds_v2 = run_one_side(model=model,
                                                     optimizer=optimizer,
                                                     preds_left=preds_v1,
                                                     pt_batch=args.pt_batch,
                                                     way=way,
                                                     shot=shot,
                                                     query=query,
                                                     n_cluster=n_cluster,
                                                     dataset=dataset,
                                                     right_encoder_side='v2')
        trn_loss += _loss

        _loss, preds_v1, tst_preds_v1 = run_one_side(model=model,
                                                     optimizer=optimizer,
                                                     preds_left=preds_v2,
                                                     pt_batch=args.pt_batch,
                                                     way=way,
                                                     shot=shot,
                                                     query=query,
                                                     n_cluster=n_cluster,
                                                     dataset=dataset,
                                                     right_encoder_side='v1')
        trn_loss += _loss

        dev_f1 = cluster_metrics.calc_f1(
            gnd_assignments=torch.LongTensor(tst_preds_v1).to(device),
            pred_assignments=torch.LongTensor(tst_preds_v2).to(device))
        dev_acc = cluster_metrics.calc_ACC(
            torch.LongTensor(tst_preds_v2).to(device),
            torch.LongTensor(tst_preds_v1).to(device))

        print('dev view 1 vs view 2: f1={:.4f} acc={:.4f}'.format(
            dev_f1, dev_acc))

        if best_dev_f1 is None or dev_f1 > best_dev_f1:
            print('new best epoch', epoch)
            best_epoch = epoch
            best_dev_f1 = dev_f1
            best_model = copy.deepcopy(model.state_dict())
            best_preds_v1 = preds_v1.copy()
            best_preds_v2 = preds_v2.copy()

        lgolds, lpreds = [], []
        for g, p in zip(golds, list(preds_v1)):
            if g > 0:
                lgolds.append(g)
                lpreds.append(p)
        prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
            gnd_assignments=torch.LongTensor(lgolds).to(device),
            pred_assignments=torch.LongTensor(lpreds).to(device))
        acc = cluster_metrics.calc_ACC(
            torch.LongTensor(lpreds).to(device),
            torch.LongTensor(lgolds).to(device))

        print(
            f'{datetime.datetime.now()} epoch {epoch}, test prec={prec:.4f} rec={rec:.4f} '
            f'f1={f1:.4f} acc={acc:.4f}')

    print('restoring model for best dev epoch', best_epoch)
    model.load_state_dict(best_model)
    preds_v1, preds_v2 = best_preds_v1, best_preds_v2

    lgolds, lpreds = [], []
    for g, p in zip(golds, list(preds_v1)):
        if g > 0:
            lgolds.append(g)
            lpreds.append(p)
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))
    print(
        f'{datetime.datetime.now()} test prec={prec:.4f} rec={rec:.4f} f1={f1:.4f} acc={acc:.4f}'
    )

    if args.save_model_path is not None:
        preds_v1 = torch.from_numpy(preds_v1)
        if preds_v2 is not None:
            preds_v2 = torch.from_numpy(preds_v2)
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': preds_v1,
            'v2_assignments': preds_v2
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)
Example #14
import sys
import os
from os import sep
from os.path import join
from os.path import expanduser as expand

script_dir = os.path.dirname(os.path.abspath(__file__))
t_dir = os.path.join(script_dir, "t")
sys.path.append(t_dir)

settings = {}
settings["SESSION_FILE"] = ".pomodoro_session"
settings["SESSION_DIR"] = expand("~/.pomodoro/")

settings["TASK_DIR"] = expand("~/.pomodoro/")
settings["INVENTORY_FILE"] = "tasks"
settings["SCHEDULE_FILE_PREFIX"] = "todo"
settings["CURRENT_TASK_FILE"] = "current"

try:
    with open(expand("~/.pomodororc"), "r") as f:
        user_settings = {}
        for line in f:
            # skip comments and empty lines
            if line.strip().startswith("#") or not line.strip():
                continue
            setting, value = line.split("=")
            setting = setting.strip()
            value = value.strip()
            # expand user directories in paths
            if "DIR" in setting:
Example #15
 def search(self,
            iterations,
            call_budget: int = 0,
            time_budget: float = 0) -> Sample:
     assert iterations > 0 or call_budget > 0 or time_budget > 0
     logger.debug(
         f"start training LaMCTS: {iterations} iterations, {call_budget} call budget"
     )
     sample_time = 0.0
     Node.init(self._num_split_workers)
     if self._num_sample_workers > 1 and len(
             self._funcs) == self._num_sample_workers:
         Sampler.init_workers(
             [self._sampler_conf for _ in range(self._num_sample_workers)],
             self._funcs, self._device)
     else:
         Sampler.init_workers([self._sampler_conf], [self._func],
                              self._device)
     start_time = time.time()
     self.init_tree()
     rand = False
     idx = 0
     while True:
         idx += 1
         logger.debug(f"{'=' * 60}")
         logger.debug(f"training iteration: {idx}")
         path = Path([self._root])
         path.expand(rand=rand)
         logger.debug(f"path: {path}")
         tt = time.time()
         call_time = self._func.total_call_time
         samples = Sampler.gen_sample(path, self._num_samples_per_iter)
         logger.debug(
             f"sample size: {len(samples.xs)}, unique size {len(np.unique(samples.xs.astype(dtype=np.int32), axis=0))}"
         )
         # samples = self._sampler.sample(path, self._num_samples_per_iter)
         sample_time += time.time() - tt - (self._func.total_call_time -
                                            call_time)
         if len(samples) == 0:
             logger.debug("no sample found")
             rand = True
             continue
         rand = False
         logger.debug(
             f"best in node: {path[-1].bag.fxs.max()}, best in new samples: {samples.fxs.max()}, "
             f"# of new samples: {len(samples)}")
         self._root.add_bag(samples)
         self._root = Node.split_node(self._root)
         logger.debug(
             f"  number of nodes:   {self._root.num_descendants + 1}")
         logger.debug(f"  number of samples: {len(self._root.bag)}")
         logger.debug(f"  number of calls:   {self._func.total_calls}")
         logger.debug(
             f"  avg leaf size:     {len(self._root.bag) / self._root.num_leaves:.2}"
         )
         logger.debug(f"  best sample:       {self._root.bag.best}")
         if (0 < iterations <= idx
                 or 0 < call_budget <= self._func.total_calls
                 or 0 < time_budget <= time.time() - start_time):
             break
     logger.debug(f"sample time: {sample_time}")
     logger.debug(f"split time:  {Node.split_time}")
     logger.debug(f"call time:   {self._func.total_call_time}")
     Node.cleanup()
     Sampler.close_workers()
     return self._root.bag.best
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path', type=str, default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model', type=str, choices=['ae', 'qt'], default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path', type=str, help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)

    parser.add_argument('--save-model-path', type=str)

    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='tag')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(args.data_path, view1_col=args.view1_col, view2_col=args.view2_col, label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print ("num of class = %d" %n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = multiview_encoders.load_model(args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = multiview_encoders.create_model_from_embeddings(
            args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    pre_acc, pre_state = 0., None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset, perm_idx, expressions, train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset, dataset.tst_idx, expressions, train=False)
        if tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(datetime.datetime.now(), epoch, trn_loss, tst_acc))
        if args.save_model_path is not None:
            save_model_path = f'{args.save_model_path}_pre_e{epoch}.dat'
            state = {
                'model_state': model.state_dict(),
                'id_to_token': dataset.id_to_token,
                'word_emb_size': word_emb_size
            }
            with open(expand(save_model_path), 'wb') as f:
                torch.save(state, f)
            print('saved model to ', save_model_path)

            save_model_path = f'{args.save_model_path}_pre_best_e{epoch}.dat'
            state = {
                'model_state': pre_state,
                'id_to_token': dataset.id_to_token,
                'word_emb_size': word_emb_size
            }
            with open(expand(save_model_path), 'wb') as f:
                torch.save(state, f)
            print('saved model to ', save_model_path)

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print('loaded best state')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialize optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster, max_iter=300, verbose=0, random_state=0)
    latent_z1s, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    pred1s = kmeans.fit_predict(latent_z1s)

    lgolds, lpreds = [], []
    for g, p in zip(golds, list(pred1s)):
        if g > 0:
            lgolds.append(g)
            lpreds.append(p)
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(gnd_assignments=torch.LongTensor(lgolds).to(device), pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(torch.LongTensor(lpreds).to(device), torch.LongTensor(lgolds).to(device))
    silhouette, davies_bouldin = sklearn.metrics.silhouette_score(latent_z1s, pred1s, metric='euclidean'), sklearn.metrics.davies_bouldin_score(latent_z1s, pred1s)

    print('{} pretrain: eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(datetime.datetime.now(), prec, rec, f1, acc, silhouette, davies_bouldin))
    perm_idx = dataset.trn_idx
    pred2s, centroids1, centroids2, pred1s_perm_idx, pred2s_perm_idx = None, None, None, None, None
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        shot, way, query = 5, args.way, 15
        sampler1 = CategoriesSampler(pred1s, args.pt_batch, way, shot+query)
        train1_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler1]
        trn_loss += do_pass(train1_batches, shot, way, query, expressions, encoder='v2')

        latent_z2s, _ = transform(dataset, perm_idx, model, encoder='v2')
        centroids2 = calc_centroids(latent_z2s, pred1s, n_cluster)
        kmeans2 = sklearn.cluster.KMeans(n_clusters=n_cluster, init=centroids2, max_iter=10, verbose=0)
        pred2s = kmeans2.fit_predict(latent_z2s)
        pred2s_perm_idx = perm_idx.copy()
        tst_latent_z2s, _ = transform(dataset, dataset.tst_idx, model, encoder='v2')
        tst_pred2s = kmeans2.predict(tst_latent_z2s)

        sampler2 = CategoriesSampler(pred2s, args.pt_batch, way, shot+query)
        train2_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler2]
        trn_loss += do_pass(train2_batches, shot, way, query, expressions, encoder='v1')

        perm_idx = np.random.permutation(dataset.trn_idx)
        latent_z1s, golds = transform(dataset, perm_idx, model, encoder='v1')
        centroids1 = calc_centroids(latent_z1s, pred2s, n_cluster)
        kmeans1 = sklearn.cluster.KMeans(n_clusters=n_cluster, init=centroids1, max_iter=10, verbose=0)
        pred1s = kmeans1.fit_predict(latent_z1s)
        pred1s_perm_idx = perm_idx.copy()
        tst_latent_z1s, _ = transform(dataset, dataset.tst_idx, model, encoder='v1')
        tst_pred1s = kmeans1.predict(tst_latent_z1s)

        f1 = cluster_metrics.calc_f1(gnd_assignments=torch.LongTensor(tst_pred1s).to(device), pred_assignments=torch.LongTensor(tst_pred2s).to(device))
        acc = cluster_metrics.calc_ACC(torch.LongTensor(tst_pred2s).to(device), torch.LongTensor(tst_pred1s).to(device))

        print('TEST f1={:.4f} acc={:.4f}'.format(f1, acc))

        lgolds, lpreds = [], []
        for g, p in zip(golds, list(pred1s)):
            if g > 0:
                lgolds.append(g)
                lpreds.append(p)
        prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(gnd_assignments=torch.LongTensor(lgolds).to(device), pred_assignments=torch.LongTensor(lpreds).to(device))
        acc = cluster_metrics.calc_ACC(torch.LongTensor(lpreds).to(device), torch.LongTensor(lgolds).to(device))
        silhouette, davies_bouldin = sklearn.metrics.silhouette_score(latent_z1s, pred1s, metric='euclidean'), sklearn.metrics.davies_bouldin_score(latent_z1s, pred1s)

        print('{} epoch {}, eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(
            datetime.datetime.now(), epoch, prec, rec, f1, acc, silhouette, davies_bouldin))

    if args.save_model_path is not None:
        pred1s = torch.from_numpy(pred1s)
        if pred2s is not None:
            pred2s = torch.from_numpy(pred2s)
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': pred1s,
            'v2_assignments': pred2s,
            'pred1s_perm_idx': pred1s_perm_idx,
            'pred2s_perm_idx': pred2s_perm_idx
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)
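
A minimal sketch (with made-up data) of the centroid-seeded KMeans step above: compute per-cluster means of one view's latents from the other view's assignments and pass them as the init for sklearn's KMeans. calc_centroids itself is not shown in the snippet, so the per-cluster mean here is only an assumption about what it does; n_init=1 is added to avoid sklearn's warning when init is an explicit array.

# Sketch only: seed KMeans with centroids derived from another view's assignments.
import numpy as np
import sklearn.cluster

latents = np.random.randn(100, 8)                 # latent vectors for one view (made up)
other_view_preds = np.random.randint(0, 4, 100)   # cluster assignments from the other view (made up)
n_cluster = 4

# assumed behaviour of calc_centroids: mean latent vector per assigned cluster
centroids = np.stack([latents[other_view_preds == c].mean(axis=0) for c in range(n_cluster)])
kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster, init=centroids, n_init=1, max_iter=10)
preds = kmeans.fit_predict(latents)
print(np.bincount(preds))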
Example #17
def run_for_company(in_csv_file, in_max_rows, examples_writer, target_company,
                    no_preprocess):
    f_in = open(expand(in_csv_file), 'r')
    node_by_id = {}
    start_node_ids = []
    dict_reader = csv.DictReader(f_in)
    next_by_prev = {}
    for row in dict_reader:
        id = row['tweet_id']
        prev = row['in_response_to_tweet_id']
        next_by_prev[prev] = id
        if prev == '' and ('@' + target_company) in row['text']:
            start_node_ids.append(id)
        node_by_id[id] = row
        if in_max_rows is not None and len(node_by_id) >= in_max_rows:
            print('reached max rows', in_max_rows, '=> breaking')
            break
    print('len(node_by_id)', len(node_by_id))
    print('len(start_node_ids)', len(start_node_ids))
    count_by_status = defaultdict(int)
    count_by_count = defaultdict(int)

    preprocessor = twitter_airlines.Preprocessor(
    ) if not no_preprocess else NullPreprocessor()

    for i, start_node_id in enumerate(start_node_ids):
        conversation_texts = []
        first_utterance = None
        is_valid = True
        node = node_by_id[start_node_id]
        cust_twitter_id = node['author_id']
        while True:
            text = node['text'].replace('\n', ' ')
            if node['inbound'] == 'True':
                start_tok = '<cust__'
                end_tok = '__cust>'
            else:
                start_tok = '<rep__'
                end_tok = '__rep>'

            _valid, text = preprocessor(text,
                                        info_dict={
                                            'target_company': target_company,
                                            'cust_twitter_id': cust_twitter_id
                                        })
            if not _valid:
                is_valid = False
            text = start_tok + ' ' + text + ' ' + end_tok
            if first_utterance is None:
                first_utterance = text
            else:
                conversation_texts.append(text)
            response_id = next_by_prev.get(node['tweet_id'], None)
            if response_id is None:
                count_by_count[len(conversation_texts) + 1] += 1
                if is_valid:
                    examples_writer.writerow({
                        'first_tweet_id':
                        start_node_id,
                        'first_utterance':
                        first_utterance,
                        'context':
                        ' '.join(conversation_texts)
                    })
                    count_by_status['accept'] += 1
                else:
                    count_by_status['not_valid'] += 1
                break
            if response_id not in node_by_id:
                count_by_status['response id not found'] += 1
                print('warning: response_id', response_id,
                      'not found => skipping conversation')
                break
            node = node_by_id[response_id]
    f_in.close()
    print(count_by_status)
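
For reference, a toy illustration of the reply-chain walk used in run_for_company: next_by_prev maps a tweet id to the id of the tweet that replied to it, and a conversation is collected by following that mapping until it runs out. The ids and texts below are made up.

# Sketch only: walk a reply chain via a prev -> next mapping.
next_by_prev = {'1': '2', '2': '3'}   # made-up tweet ids
text_by_id = {'1': 'hi @airline, my bag is lost', '2': 'sorry to hear that, DM us', '3': 'done, thanks'}

conversation = []
node_id = '1'                         # a start node (first inbound tweet)
while node_id is not None:
    conversation.append(text_by_id[node_id])
    node_id = next_by_prev.get(node_id)   # None ends the chain
print(conversation)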