Exemple #1
0
    def _ldWordVectors(self, path, vocab):
        """
        Read word embeddings restricted to a subset of tokens.

        Each line of the file is treated as a token followed by its
        whitespace-separated vector components.  Only lines whose token is a
        key of ``vocab`` are parsed; every other line is skipped.

        Parameters
        ----------
        path : str
            path where word embeddings are saved
        vocab : { Object:int }
            which embeddings we should pull and their row index

        Returns
        ----------
        embeddings : (N, D) numpy float array
            embedding matrix with N = len(vocab).  Rows of tokens that were
            not found in the file stay all-zero.  D is the length of the last
            vector that was read (0 when no token was found).

        Raises
        ----------
        ValueError
            if a component of a selected token's vector is not a valid float
        """
        wvecs = {}  # row index -> embedding vector
        embDim = 0

        f = open_file(path, 'rt', encoding='utf8')
        try:
            for line in f:
                # The token is everything before the first space.
                word = line[:line.find(' ')]
                if word not in vocab:
                    continue

                wemb = np.asarray(
                    [float(w) for w in line.strip().split()[1:]])
                embDim = wemb.shape[0]
                wvecs[vocab[word]] = wemb
        finally:
            # Release the handle even when a malformed line aborts the load.
            f.close()

        embeddings = np.zeros((len(vocab), embDim))
        for index, wemb in wvecs.items():
            embeddings[index, :] = wemb

        print('Loaded word vecs: %d unigrams found' % (len(wvecs)))

        return embeddings
Exemple #2
0
 def serialize(self, path):
     """
     Pickle this object to ``path``.

     Parameters
     ----------
     path : str
         destination file, opened in binary write mode via ``open_file``
     """
     outFile = open_file(path, 'wb')
     try:
         pickle.dump(self, outFile)
     finally:
         # Close the handle even if pickling fails part-way through.
         outFile.close()
Exemple #3
0
    def deserialize(path):
        """
        Load a pickled object from ``path``.

        Parameters
        ----------
        path : str
            file to read, opened in binary read mode via ``open_file``

        Returns
        ----------
        classifier : object
            whatever object was pickled into the file
        """
        f = open_file(path, 'rb')
        try:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load files that come from a trusted source.
            classifier = pickle.load(f)
        finally:
            # Close the handle even if unpickling fails.
            f.close()

        return classifier
def loadData(path, depvars, proptest=None):
    """
    Read data.  Each line contains a single JSON record with the tweet text,
    the labels it has been assigned, and its train/dev/test fold.  Missing
    labels are denoted with null values (blank strings are treated the same
    way).  Lines that are not valid JSON are skipped.

    The file is read twice: a first pass discovers the dependent variables
    and the fold scheme, a second pass collects the examples.  Three fold
    schemes are recognised:

    * numeric folds -- the highest fold index is the test fold, every other
      example keeps its own fold index;
    * 'train'/'dev'/'test' with a dev fold -- train is fold 0, dev is fold 1;
    * 'train'/'test' without a dev fold -- train examples are spread at
      random over the non-test folds of a 5-fold split.

    Both the ``random`` and ``numpy.random`` generators are re-seeded with
    SEED on every call.

    Parameters
    ----------
    path : str
        path to data file, relative to DATA_DIR
    depvars : [ str ]
        dependent variables to extract; if empty, all dependent variables
        found in the data are extracted
    proptest : float
        if set, examples without a fold are placed in the test set with this
        probability and in the train set otherwise

    Returns
    ----------
    trainDocs   : [ str ]
        documents to train on, just text
    testDocs    : [ str ]
        documents to test on, text
    trainLabels : [ [ str ] ]
        labels for train set, one for each dependent variable (None where
        the label is missing)
    testLabels  : [ [ str ] ]
        labels for test set
    folds       : [ int ]
        fold each train example is placed in
    tuningFolds : [ int ]
        which folds to evaluate on
    depvars     : [ str ]
        the dependent variables used; all of them, sorted, if the depvars
        argument was empty
    alphabets   : { str:{ str:int } }
        dictionary of labels for each label type; the most frequent label
        of each type is mapped to index 0

    Raises
    ----------
    KeyError
        if a record has no 'label' field, or has no 'fold' field while
        proptest is not set
    Exception
        if a record's fold is a string that is neither 'train', 'dev',
        'test' nor numeric
    """

    random.seed(SEED)
    np.random.seed(SEED)

    # First pass: see if we need to split into 5 folds, or if they are given
    # explicitly.  When numeric folds are given, assumes the highest index
    # fold is the test fold.
    allDepVars = set()  # all dependent features in data
    hasDevFold = False  # tuning fold explicitly set
    testFoldIdx = -1

    f = open_file(os.path.join(DATA_DIR, path), 'rt')
    try:
        for ln in f:
            try:
                tweet = json.loads(ln)
            except ValueError:
                continue

            # keep track of all dependent variables
            allDepVars.update(tweet['label'].keys())

            if 'fold' not in tweet:
                continue
            fold = tweet['fold']
            if fold == 'dev':
                hasDevFold = True
            elif isinstance(fold, int) or re.match(r'\d+', fold):
                testFoldIdx = max(testFoldIdx, int(fold))
    finally:
        f.close()

    if not depvars:
        depvars = sorted(allDepVars)

    labelAlphabets = {v: Alphabet() for v in depvars}
    # keep track of label frequency, one counter per dependent variable
    labelCounts = [{} for _ in depvars]

    if testFoldIdx > -1:  # folds are already numbered, treat highest as test fold
        numFolds = 1 + testFoldIdx
        tuningFolds = list(range(numFolds - 1))
    elif not hasDevFold:  # assign to folds myself
        numFolds = 5
        tuningFolds = list(range(numFolds - 1))
        # TODO these are not being assigned!
    else:
        numFolds = 3
        tuningFolds = [1]
    testFoldIdx = numFolds - 1

    trainDocs = []
    testDocs = []
    trainLabels = []
    testLabels = []
    folds = []

    # Second pass: collect texts, labels and fold assignments.
    f = open_file(os.path.join(DATA_DIR, path), 'rt')
    try:
        for ln in f:
            try:
                tweet = json.loads(ln)
            except ValueError:
                continue

            # make our own test set by rolling a die, if fold is not given
            if (proptest is not None) and ('fold' not in tweet):
                tweet['fold'] = ('test'
                                 if random.random() < proptest else 'train')

            fold = tweet['fold']

            # A label counts as missing when it is absent, null or blank.
            labels = []
            for v in depvars:
                value = tweet['label'].get(v)
                if value is not None and (not isinstance(value, str)
                                          or value.strip()):
                    labels.append(value)
                else:
                    labels.append(None)

            if 'text' in tweet:
                text = tweet['text']
            else:
                # pull out text from the embedded tweet
                text = tweet['tweet']['text']

            for v, label, counts in zip(depvars, labels, labelCounts):
                if label is not None:
                    labelAlphabets[v].put(label)
                    counts[label] = counts.get(label, 0) + 1

            if fold == 'train':
                trainDocs.append(text)
                trainLabels.append(labels)

                if hasDevFold:  # train is fold 0, dev is 1, test is 2
                    folds.append(0)
                else:  # TODO what if hasDevFold == False?? is this correct moving this into an else?
                    folds.append(np.random.randint(0, numFolds - 1))
            elif fold == 'dev':  # we have an explicitly set dev fold
                trainDocs.append(text)
                trainLabels.append(labels)
                folds.append(1)  # TODO is this correct now if hasDevFold?
            elif fold == 'test':
                testDocs.append(text)
                testLabels.append(labels)
            elif isinstance(fold, int) or re.match(r'\d+', fold):
                if int(fold) == testFoldIdx:
                    testDocs.append(text)
                    testLabels.append(labels)
                else:
                    trainDocs.append(text)
                    trainLabels.append(labels)
                    folds.append(int(fold))
            else:  # Should never hit this
                raise Exception('Example missing fold!', text, labels)
    finally:
        f.close()

    alphabets = {v: alpha._wToI for v, alpha in labelAlphabets.items()}

    # make the class with the most examples be the negative one.  May want to change this
    # eventually to let user set positive class.
    for counts, v in zip(labelCounts, depvars):
        if not counts:
            # No label was ever observed for this variable: nothing to swap.
            continue
        # Ties on count are broken by the larger label, as (count, label)
        # tuples are compared.
        majWord = max((c, w) for w, c in counts.items())[1]
        oldNegWord = [w for w in alphabets[v] if alphabets[v][w] == 0][0]
        alphabets[v][majWord], alphabets[v][oldNegWord] = (
            alphabets[v][oldNegWord], alphabets[v][majWord])

    return (trainDocs, testDocs, trainLabels, testLabels, folds, tuningFolds,
            depvars, alphabets)
 def write(self, *args):
     """
     Write to the wrapped stdout stream and, if set, append to the log file.

     The arguments are forwarded unchanged to the ``write`` method of
     ``self.stdoutFile``.  When ``self.outPath`` is truthy the same arguments
     are also appended to that file, which is opened and closed on every
     call.
     """
     self.stdoutFile.write(*args)
     if self.outPath:
         self.outFile = open_file(self.outPath, 'at')
         try:
             self.outFile.write(*args)
         finally:
             # Close the handle even if the write itself fails.
             self.outFile.close()