Example 1
    def __init__(self, config, train_set=None):
        # Book-keeping.
        self.config = config
        if self.config['pretrained']:
            self.init_saved_network(self.config['pretrained'])
        else:
            assert train_set is not None
            print('Train vocab: {}'.format(len(train_set.vocab)))
            vocab = Counter()
            for w in train_set.vocab:
                if train_set.vocab[w] >= config['min_freq']:
                    vocab[w] = train_set.vocab[w]
            print('Pruned train vocab: {}'.format(len(vocab)))
            # Building network.
            word_model = WordModel(embed_size=self.config['embed_size'],
                                   filename=self.config['embed_file'],
                                   embed_type=self.config['embed_type'],
                                   top_n=self.config['top_vocab'],
                                   additional_vocab=vocab)
            self.config['embed_size'] = word_model.embed_size
            self._init_new_network(train_set, word_model)

        num_params = 0
        for name, p in self.network.named_parameters():
            print('{}: {}'.format(name, str(p.size())))
            num_params += p.numel()
        print('#Parameters = {}\n'.format(num_params))

        self._init_optimizer()
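
For reference, a hypothetical config covering every key this constructor reads; the values are illustrative assumptions, not taken from the source:

config = {
    'pretrained': None,                   # path to a saved network; falsy builds a new one
    'min_freq': 5,                        # drop training-vocab words below this count
    'embed_size': 300,                    # later overwritten with word_model.embed_size
    'embed_file': 'glove.840B.300d.txt',  # hypothetical pretrained-embeddings file
    'embed_type': 'glove',
    'top_vocab': 100000,                  # top-N pretrained words to keep
}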
Example 2
# python-docx imports needed by this snippet.
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

# TableInfo, WordModel, getCellText, doc_mytable, dataConvert and the
# column map (a module-level dict named `dict`, shadowing the builtin)
# are assumed to be defined elsewhere in the module.


def doc_parsing(doc):
    listField = []
    listTable = []
    fieldName = ''
    fileDesc = ''
    for doc_part in doc.element.body:
        if isinstance(doc_part, CT_P):
            pg = Paragraph(doc_part, doc).text
            if '<table_name>' in pg and '</table_name>' in pg:
                fieldName = pg[pg.find('<table_name>') +
                               len('<table_name>'):pg.find('</table_name>')] + '.java'
                fileDesc = pg[:pg.find('<table_name>')]

        if isinstance(doc_part, CT_Tbl) and fieldName != '':
            tableinfo = TableInfo()
            tableinfo.fileName = fieldName
            tableinfo.fileDesc = fileDesc
            tb1 = Table(doc_part, doc)
            if not doc_mytable(tb1):
                continue
            # Skip the header row.
            for row in range(1, len(tb1.rows)):
                w2 = WordModel()
                # Read each mapped column; dict.get(key, '') replaces the
                # Python-2-only `dict.has_key` pattern with the same fallback.
                for attr in ('field', 'fieldName', 'fieldType', 'comment',
                             'must'):
                    setattr(w2, attr,
                            getCellText(tb1, row, dict.get(attr, '')))
                # print w2.display()
                w2.fieldType = dataConvert(w2.fieldType)
                listField.append(w2)
                # for col in range(len(tb1.columns)):
                #     cell_table = tb1.cell(row, col)
                #     table_nested_parsing(cell_table, row, col)
            tableinfo.listField = listField
            listTable.append(tableinfo)
            fieldName = ''
            listField = []
    return listTable
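
A minimal usage sketch, assuming a python-docx document whose paragraphs carry the <table_name> convention above (the file name is hypothetical):

from docx import Document

doc = Document('schema.docx')  # hypothetical input file
for info in doc_parsing(doc):
    print(info.fileName, len(info.listField))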
Example 3
import json

import coloredlogs, logging
from word_model import WordModel
from encoder import Encoder
from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F

# Create a logger object.
logger = logging.getLogger(__name__)

coloredlogs.install(level='DEBUG',
                    fmt='%(asctime)s,%(msecs)03d %(levelname)s %(message)s')

dataset = json.load(open('data/dev-v1.1.json'))

word_model = WordModel()
logger.warning('Loading Vocab ...')
word_model.load_vocab()
vocab_size = word_model.vocab.length()
encoder = Encoder(vocab_size=vocab_size)
optimiser = torch.optim.SGD(encoder.parameters(), lr=0.0001)
criterion = nn.NLLLoss()


def train_model(context, question, answer, target_start, target_end):
    context, question, answer = Variable(context), Variable(
        question), Variable(answer)
    context = context.unsqueeze(0)
    question = question.unsqueeze(0)
    answer = answer.unsqueeze(0)
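
The body of train_model is truncated above; for orientation, a hypothetical call, assuming the inputs are 1-D LongTensors of token indices and the targets are answer-span positions:

ctx = torch.LongTensor([4, 8, 15, 16, 23])  # context token ids (illustrative)
q = torch.LongTensor([42, 7])               # question token ids
ans = torch.LongTensor([15, 16])            # answer token ids
train_model(ctx, q, ans, target_start=2, target_end=3)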
Example 4
import json
import nltk

nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

from word_model import Vocab, WordModel
import pickle
import coloredlogs, logging
from encoder import Encoder
from torch.autograd import Variable
import torch
import torch.nn as nn

# Create a logger object.
logger = logging.getLogger(__name__)

coloredlogs.install(level='DEBUG',
                    fmt='%(asctime)s,%(msecs)03d %(levelname)s %(message)s')

dataset = json.load(open('data/dev-v1.1.json'))

word_model = WordModel()
logger.warning('Generating Vocab ...')
word_model.store_into_vocab(dataset)
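
pickle is imported above but unused in the visible snippet; a plausible follow-up, sketched here as an assumption (the path is hypothetical), persists the vocab so the previous example's word_model.load_vocab() has something to load:

with open('data/vocab.pkl', 'wb') as f:  # hypothetical vocab path
    pickle.dump(word_model.vocab, f)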
Example 5
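    # Tail of a term-to-fingerprint helper (likely termToBitmap, given the
    # call below); its signature and `url` are truncated above.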
    headers = {"Content-Type": "application/json", "api_key": CORTICAL_API_KEY}
    response = requests.post(url, headers=headers, data=term)
    return json.loads(response.content).pop()


def fingerprintToTerm(fingerprint):
    url = "http://api.cortical.io:80/rest/expressions/similarTerms?retinaName=en_associative"
    headers = {"Content-Type": "application/json", "api_key": CORTICAL_API_KEY}
    response = requests.post(url,
                             headers=headers,
                             data=json.dumps(fingerprint))
    return json.loads(response.content)


auth = tweepy.OAuthHandler(TWITTER_KEY, TWITTER_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_SECRET)
model = WordModel()

api = tweepy.API(auth)

public_tweets = api.user_timeline("rhyolight", count=100)
# print(type(public_tweets))
for tweet in public_tweets:
    # print(tweet.text)
    cleanTweet = cleanText(tweet.text)
    print(cleanTweet)
    sdr = termToBitmap(cleanTweet)
    # print(sdr)
    terms = fingerprintToTerm(sdr)
    print('\tclosest term: %s' % terms[0]['term'])
Example 6
    # To convert words in the input to indices of the embeddings matrix
    word_to_idx = {
        word: i
        for i, word in enumerate(gensim_embeds.vocab.keys())
    }

    # Set hyperparameters
    # Number of output classes (9)
    n_classes = len(TAG_INDICES)
    n_epochs = EPOCHS
    p = DROPOUT
    report_every = 1

    # Set up and initialize model
    model = WordModel(pretrained_embeds, 100, len(word_to_idx), n_classes, p)
    loss_function = NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.6)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
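    # NOTE (assumption): tensors created in the training loop below must also
    # be moved to `device` before the forward pass.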

    # Training loop
    for e in range(n_epochs + 1):
        total_loss = 0
        for sent in data["train"]:

            # (1) Zero the gradients before processing this new example.
            model.zero_grad()

            # (2) Encode sentence and tag sequence as sequences of indices