Example 1
import class_defs
import utils
import utils_temp


def main():
  top_obj = class_defs.top()

  # Pair each input file with its key file, line by line.
  with open("input_file_list.txt") as input_list_fp, \
       open("key_file_list.txt") as key_list_fp:
    for ifile, kfile in zip(input_list_fp, key_list_fp):
      ifile = ifile.strip('\n')
      kfile = kfile.strip('\n')
      print("Now Processing {}, {}".format(ifile, kfile))

      top_obj.docs[ifile] = class_defs.document(top_obj, ifile, kfile)

  # Select a random subset of the generated negative data.
  utils_temp.select_neg_data(top_obj, 2)
  print("Pos Create Ana Encountered : ", top_obj.pos_create_ana_encountered)
  print("Number of Positive and Negative Samples Generated")
  print("Positive : {} Negative {} Selected Negative {}".format(
      len(top_obj.pos_list), len(top_obj.neg_list), len(top_obj.selected_neg_list)))

  # Debug prints
  #for key, dobj in top_obj.docs.items():
  #  utils.compare_total_antecedents(dobj)

  utils.create_features(top_obj)
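
`utils_temp.select_neg_data` is defined outside this excerpt. A minimal sketch of what it might do, assuming it fills `top_obj.selected_neg_list` by sampling negatives at the given ratio to positives (the internals are an assumption, not source code):

import random

def select_neg_data(top_obj, neg_ratio):
    # Hypothetical sketch: keep at most neg_ratio negatives per positive sample.
    k = min(len(top_obj.neg_list), neg_ratio * len(top_obj.pos_list))
    top_obj.selected_neg_list = random.sample(top_obj.neg_list, k)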
Example 2
    def train(self, X_train, Y_train):
        # Labels as a LongTensor, as expected by the classification loss.
        y_train = torch.from_numpy(Y_train.astype(int)).type(torch.LongTensor)

        tot_loss = 0.0
        all_preds = []

        for t in range(self.epochs):
            # model.train()

            # Full-batch forward pass; `A` (the adjacency matrix) and `graph`
            # come from the enclosing scope.
            y_pred = self.model(A, utils.create_features(graph))
            all_preds.append(y_pred)

            # The loss is computed on the training nodes only.
            loss = self.loss_function(y_pred[X_train], y_train)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Log with .item() so the autograd graph is not kept alive.
            epoch_loss = loss.item()
            tot_loss += loss.item()

            print(str(t), 'epoch_loss:' + str(epoch_loss),
                  'total loss:' + str(tot_loss))

        self.all_preds = all_preds
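
The adjacency input `A` is defined outside this excerpt. For a two-layer GCN like the one in Example 9, `A` is commonly the symmetrically normalized adjacency with self-loops; the sketch below shows that standard construction from a networkx graph, which is an assumption about this project rather than code taken from it:

import networkx as nx
import numpy as np
import torch

def normalized_adjacency(graph):
    # GCN propagation matrix: D^-1/2 (A + I) D^-1/2, with self-loops added.
    adj = nx.to_numpy_array(graph) + np.eye(graph.number_of_nodes())
    d_inv_sqrt = np.diag(1.0 / np.sqrt(adj.sum(axis=1)))
    return torch.FloatTensor(d_inv_sqrt @ adj @ d_inv_sqrt)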
Example 3
    def process_graph(self, graph_path, batch_loss):
        """
        Read a graph and do a forward pass on it with a time budget.
        :param graph_path: Location of the graph to process.
        :param batch_loss: Loss on the graphs processed so far in the batch.
        :return batch_loss: Incremented loss including the current graph.
        """
        with open(graph_path) as graph_file:
            data = json.load(graph_file)
        graph, features = create_features(data, self.model.identifiers)
        # Start the attention walk from a random node.
        node = random.choice(list(graph.nodes()))
        attention_loss = 0
        for t in range(self.args.time):
            predictions, node, attention_score = self.model(data, graph, features, node)
            target, prediction_loss = calculate_predictive_loss(data, predictions)
            batch_loss = batch_loss + prediction_loss
            if t < self.args.time - 2:
                # Discounted log-attention term, weighted by the reward below.
                attention_loss += (self.args.gamma**(self.args.time - t)) * torch.log(attention_score)
        reward = calculate_reward(target, predictions)
        batch_loss = batch_loss - reward * attention_loss
        self.model.reset_attention()
        return batch_loss
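
`calculate_predictive_loss` and `calculate_reward` come from elsewhere in this project. A rough sketch of the contract the loop above appears to rely on; the internals, including the `data["target"]` field and the log-probability assumption on `predictions`, are assumptions:

import torch
import torch.nn.functional as F

def calculate_predictive_loss(data, predictions):
    # Hypothetical sketch: negative log-likelihood of the graph's true label;
    # assumes `predictions` are log-probabilities of shape (1, n_classes).
    target = torch.LongTensor([data["target"]])
    return target, F.nll_loss(predictions, target)

def calculate_reward(target, predictions):
    # Hypothetical sketch: +1 when the prediction is correct, -1 otherwise.
    return 1.0 if predictions.argmax(dim=1).item() == target.item() else -1.0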
Example 4
    def score(self):
        """
        Score the test set graphs.
        """
        print("\nScoring the test set.\n")
        self.model.eval()
        self.predictions = []
        for graph_path in tqdm(self.test_graphs):
            with open(graph_path) as graph_file:
                data = json.load(graph_file)
            graph, features = create_features(data, self.model.identifiers)
            node_predictions = []
            for _ in range(self.args.repetitions):
                # Each repetition walks the graph from a fresh random start node.
                node = random.choice(list(graph.nodes()))
                for _ in range(self.args.time):
                    prediction, node, _ = self.model(data, graph, features, node)
                node_predictions.append(np.argmax(prediction.detach()))
                self.model.reset_attention()
            # Majority vote over the repeated walks.
            prediction = max(set(node_predictions), key=node_predictions.count)
            self.score_graph(data, prediction)
        self.accuracy = float(np.mean(self.predictions))
        print("\nThe test set accuracy is: " + str(round(self.accuracy, 4)) + ".\n")
Example 5
    data_dict = pickle.load(data_file)

# coerce data features to numeric values or a NaN
df = pd.DataFrame(data_dict).transpose().apply(pd.to_numeric, errors="coerce")

### Task 2: Remove outliers
# drop TOTAL: it is an aggregate row over all persons, a clear outlier rather than a data point
# drop 'THE TRAVEL AGENCY IN THE PARK': it is an entity, not a person
df = df.drop(["TOTAL", "THE TRAVEL AGENCY IN THE PARK"], errors="ignore", axis=0)
df = df.fillna(0)

### Task 3: Create new feature(s)
df["pct_poi_messages"] = df.apply(calculate_pct_poi_msgs, axis=1)

### Store to my_dataset for easy export below.
features, labels, my_dataset = create_features(df, features_list)

# construct a PCA to use as a pipeline step
pca = PCA()

### Task 4: Try a variety of classifiers

# models = trial_models
# the models below have already been tuned; to run the untuned trial models
# instead, comment out the `models` list below and uncomment the line above
models = [
    {
        "title": "DecisionTreeClassifier (RobustScaler + PCA) -- Tuned",
        "pipeline": Pipeline(
            steps=[
                ("scaler", RobustScaler()),
Example 6
import json
import time
from datetime import datetime
from pathlib import Path
from random import shuffle
from time import sleep

import numpy as np
import pandas as pd
import spacy as sp
from keras import regularizers
from keras.layers import Dropout
from sklearn.model_selection import train_test_split

import utils

nlp = sp.load('en_core_web_lg')

with open(Path('../data/models/features/data_3.json'), 'r') as f:
    datalist = json.loads(f.read())  # list of per-instance feature dicts

data, labels, ngrams = utils.create_features(
    datalist)  # data[:, :300] are the chunk vectors

docvec = [(np.fromstring(instance['title_vec'].strip('[]'), sep=',') +
           np.fromstring(instance['abstract_vec'].strip('[]'), sep=',') +
           np.fromstring(instance['text_vec'].strip('[]'), sep=',')) / 3
          for instance in datalist]
docvec = np.array(docvec)

positive_examples = sum(labels)

negative_ratio = 1

# sample negative examples, negative_ratio times the number of positives
neg_idx = [i for i in range(labels.shape[0])
           if labels[i] == 0]  # indices of negative examples
neg_idx = np.random.choice(np.array(neg_idx),
                           positive_examples * negative_ratio,
                           replace=False)
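
Since the draw uses `replace=False`, the sampled negative indices are distinct. A quick hypothetical sanity check on the resulting balance (not part of the original script):

# Hypothetical check: with negative_ratio = 1 the subsample is class-balanced.
assert len(neg_idx) == positive_examples * negative_ratio
print('positives:', positive_examples, 'sampled negatives:', len(neg_idx))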
Example 7
7 times faster for T = 750.

CONCLUSION
The self loop feedback gating mechanism of recurrent networks has been derived from first
principles via a postulate of invariance to time warpings.
'''

model = keras.models.load_model('../data/models/keras/model4.h5')

doc_data = utils.simple_preprocess(title=title, abstract=abstract, text=text)

fulldata, ngrams = utils.create_features(doc_data, labels=False)

docvec = [(np.fromstring(instance['title_vec'].strip('[]'), sep=',') +
           np.fromstring(instance['abstract_vec'].strip('[]'), sep=',') +
           np.fromstring(instance['text_vec'].strip('[]'), sep=',')) / 3
          for instance in doc_data]
docvec = np.array(docvec)

#data = fulldata[:,:300] - docvec
data = fulldata[:, 300:]  # the 11 features that follow the 300-dim document vectors
data = data[:, [7]]  # keep only the single feature at index 7

predictions = model.predict(data)

df_ = np.array([
    ngrams.reshape((-1, )),
Example 8
import json
import time
from datetime import datetime
from pathlib import Path
from random import shuffle
from time import sleep

import numpy as np
import pandas as pd
import spacy as sp
from sklearn.model_selection import train_test_split

import utils

nlp = sp.load('en_core_web_lg')


with open(Path('../data/models/features/data_3.json'), 'r') as f:
    datalist = json.loads(f.read())  # list of per-instance feature dicts


fulldata, labels, ngrams = utils.create_features(datalist)


positive_examples = sum(labels)

negative_ratio = 2

# sample negative examples, negative_ratio times the number of positives
neg_idx = [i for i in range(labels.shape[0]) if labels[i] == 0]  # indices of negative examples
neg_idx = np.random.choice(np.array(neg_idx), positive_examples*negative_ratio, replace=False)

pos_idx = [i for i in range(labels.shape[0]) if labels[i] == 1] # indices of positive examples
pos_idx = np.random.choice(np.array(pos_idx), positive_examples, replace=False)

idx = np.hstack((pos_idx, neg_idx))
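
Example 8 ends at the index stack. Given the `train_test_split` import at the top, a plausible continuation that splits the balanced subset; the split parameters are assumptions, not original code:

# Hypothetical continuation: split the balanced subset into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    fulldata[idx], labels[idx], test_size=0.2, random_state=42)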
Example 9
        self.softmax = nn.Softmax(dim=1)  # normalize over the class dimension

    def forward(self, Adj_matrix, input_features):
        # Two-layer GCN: propagate, apply non-linearity, propagate, normalize.
        x = self.layer1(Adj_matrix, input_features)
        x = self.activation(x)
        x = self.layer2(Adj_matrix, x)
        x = self.softmax(x)
        return x


# In[6]:

model = GCN(inputs_shape=utils.create_features(graph).shape[1],
            outputs_shape=4,
            n_classes=2,
            activation='Tanh')

# In[7]:

# Note: F.cross_entropy applies log_softmax internally, so pairing it with the
# softmax in forward() double-normalizes the outputs; returning raw logits from
# forward() would be the conventional pairing.
trainer = train.Trainer(model,
                        optimizer=optim.Adam(model.parameters(), lr=0.01),
                        loss_function=F.cross_entropy,
                        epochs=250)

# In[8]:

trainer.train(X_train, Y_train)
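
In Examples 2 and 9, `utils.create_features(graph)` produces the node-feature matrix whose width sets `inputs_shape`. Its implementation is not shown; one common choice for featureless graphs is an identity matrix of one-hot node IDs, sketched here as an assumption:

import numpy as np
import torch

def create_features(graph):
    # Hypothetical sketch: one-hot node identities as input features, so
    # create_features(graph).shape[1] equals the number of nodes.
    return torch.FloatTensor(np.eye(graph.number_of_nodes()))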