Example 1
0
def build_vocabs(data_, min_freq=50):
    """Builds separate vocabs for text and code."""
    word_freqs = collections.defaultdict(int)
    code_freqs = collections.defaultdict(int)
    for example in data_:
        for word in example.text:
            word_freqs[word] += 1
        for word in example.code_sequence:
            code_freqs[word] += 1
    return data.get_vocab(word_freqs, min_freq), data.get_vocab(code_freqs, min_freq)
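All three vocabulary builders delegate to data.get_vocab, which is not shown in these snippets. A minimal sketch of what it might look like, assuming it drops tokens below min_freq and assigns indices after reserved special tokens; the special tokens and their ordering are assumptions, not taken from the source.

def get_vocab(freqs, min_freq):
    # Assumed special tokens; the real module may use different ones.
    vocab = {'<pad>': 0, '<unk>': 1}
    for token, count in sorted(freqs.items()):
        if count >= min_freq:
            vocab[token] = len(vocab)
    return vocab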
Example 2
0
def build_vocab(self):
    # Stream pickled examples from the start of the backing file and
    # count every code token; the threshold of 1 keeps any token that
    # appears at least once.
    tokens = collections.defaultdict(int)
    self.file.seek(0)
    while True:
        try:
            example = pickle.load(self.file)
        except EOFError:
            break
        for token in example['code']:
            tokens[token] += 1
    return data.get_vocab(tokens, 1)
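A hedged sketch of the file layout this reader assumes: one file holding several records appended with repeated pickle.dump calls. The filename is illustrative; only the 'code' key comes from the snippet above.

import pickle

with open('examples.pkl', 'wb') as f:
    pickle.dump({'code': ['def', 'f', '(', ')', ':']}, f)
    pickle.dump({'code': ['return', 'x']}, f)

Re-opening the file and calling pickle.load in a loop then yields one record per dump until EOFError, which is exactly the termination condition build_vocab relies on.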
Example 3
0
def build_vocab(data_, min_freq=50):
    """Builds single vocab."""
    freqs = collections.defaultdict(int)
    def update_freqs(words):
        for word in words:
            freqs[word] += 1
    for example in data_:
        update_freqs(example.text)
        update_freqs(example.code_sequence)
        # schema.args is a mapping; iterating items() yields
        # (column name, type) pairs, so both are counted toward
        # the shared vocabulary.
        for column in example.schema.args.items():
            update_freqs(column)
    return data.get_vocab(freqs, min_freq)
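For reference, a toy invocation. The Example and Schema containers below are stand-ins inferred from the attribute accesses in the function; the real classes are not in these snippets, so this is illustrative only.

import collections

Schema = collections.namedtuple('Schema', ['args'])
Example = collections.namedtuple('Example', ['text', 'code_sequence', 'schema'])

toy = Example(text=['sort', 'the', 'list'],
              code_sequence=['sorted', '(', 'xs', ')'],
              schema=Schema(args={'xs': 'list'}))

# Requires the project's data.get_vocab helper (sketched under Example 1).
vocab = build_vocab([toy], min_freq=1)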
Example 4
0
import sys

import spacy
import pandas as pd
from scipy import spatial
from tqdm import tqdm

sys.path.insert(0, './resources')
import config_parser, constant, eval_metric
import data_utils


vocab = data_utils.get_vocab()
vocab_set = vocab

# spaCy 2.x model shortcut; recent spaCy versions require the full
# package name, e.g. spacy.load('en_core_web_sm').
nlp = spacy.load('en')
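Example 4's imports (spacy alongside scipy.spatial) suggest vector-similarity comparisons between tokens. A minimal sketch of that pattern, assuming the loaded spaCy model exposes .vector embeddings; the tokens here are illustrative, not from the source.

# Cosine similarity between two token vectors.
doc_a, doc_b = nlp('sort'), nlp('order')
similarity = 1 - spatial.distance.cosine(doc_a.vector, doc_b.vector)
print(similarity)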
Example 5
0
    parser.add_argument('--hidden_size',
                        type=int,
                        default=1500,
                        help='dimension of lstm hidden states')
    parser.add_argument('--num_layers',
                        type=int,
                        default=2,
                        help='number of layers in lstm')
    parser.add_argument('--num_epochs', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--num_workers', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--ngpu', type=int, default=2)
    parser.add_argument('--latex_dir', type=str,
                        default='/Users/Kamoya/OCR/data/latex_snippets/')
    parser.add_argument('--image_dir', type=str,
                        default='/Users/Kamoya/OCR/data/img_snippets/')

    args = parser.parse_args()
    data = ImageSnippetDataset(args.latex_dir, args.image_dir)
    vocab = data.get_vocab()
    train(args, vocab)
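The driver above expects an ImageSnippetDataset with a get_vocab method, but the class itself is not shown. A minimal sketch of the interface it would need, written as a torch-style Dataset since the batch_size/num_workers flags suggest a DataLoader; the file layout, special tokens, and whitespace tokenization are all assumptions for illustration.

import os
import collections
from torch.utils.data import Dataset

class ImageSnippetDataset(Dataset):
    def __init__(self, latex_dir, image_dir):
        self.latex_dir = latex_dir
        self.image_dir = image_dir
        self.files = sorted(os.listdir(latex_dir))

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        # Pair each LaTeX snippet with its rendered image (loading omitted).
        return self.files[idx]

    def get_vocab(self):
        # Token -> index map over the LaTeX sources, seeded with the
        # special tokens the driver expects (e.g. "<SOS>").
        vocab = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        for name in self.files:
            with open(os.path.join(self.latex_dir, name)) as f:
                for token in f.read().split():
                    vocab.setdefault(token, len(vocab))
        return vocab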