print(time2 - time1)

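# indices of the `amount` highest-scoring articles (ascending by logit)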
indices = final_logits.argsort(axis=0)[-args.amount:].reshape(args.amount)

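# score every vocabulary word against the selected publication embedding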
word_logits = np.dot(word_emb, publication_emb.reshape(args.emb_size,
                                                       1)) + word_bias

top_articles = word_articles[indices.tolist()[0]]

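# weight each top article's word counts by the per-word publication scores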
broadcasted_words_per_article = top_articles.toarray() * word_logits.T

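# for each top article, sort word ids by their contribution (ascending)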
sorted_word_indices = broadcasted_words_per_article.argsort(axis=1)

return_articles = []

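# load the raw article data so the top articles can be looked up by index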
raw_data = Articles(args.real_data_path)
print(len(raw_data))
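# invert the word dictionary to map token ids back to words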
id_to_word = {v: k for k, v in final_word_ids.items()}

for i, idx in enumerate(indices.tolist()[0]):
    current_article = raw_data[int(idx)]
    current_article["logit"] = float(final_logits[int(idx)])
    current_sorted_words = sorted_word_indices[i]
    top_words = []
    least_words = []
    for top_word in current_sorted_words[-20:]:
        word = id_to_word[top_word]
        if "unused" not in word and "##" not in word and len(word) > 1:
            top_words.append(word)
    for least_word in current_sorted_words[:20]:
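        # body assumed to mirror the top-word loop above, collecting the
        # lowest-contribution words instead
        word = id_to_word[least_word]
        if "unused" not in word and "##" not in word and len(word) > 1:
            least_words.append(word)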
Example #2
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")

# set output directory path
output_path = Path(args.output_dir)

# tensorboard log and graph output folder
log_tensorboard_dir = output_path / "runs" / args.word_embedding_type
writer = SummaryWriter(log_tensorboard_dir)

# load datasets
train_path = Path(args.train_path)
test_path = Path(args.test_path)
eval_path = Path(args.eval_path)

train_data = Articles(train_path)
test_data = Articles(test_path)
eval_data = Articles(eval_path)
print("Data Loaded")

# check if items need to be tokenized
if args.map_items and args.tokenize:
    train_data.tokenize()
    test_data.tokenize()
    eval_data.tokenize()
    print("Items tokenized")

# create and save or load dictionaries based on arguments
if args.create_dicts:
    final_word_ids, final_url_ids, final_publication_ids = dictionary.create_merged_dictionaries(
        train_data.examples, "target")
Example #3
                    help="This is required to load dictionaries")

parser.add_argument('--dataset_path',
                    type=expand_path,
                    required=True,
                    help='Path to data to be ranked.')

args = parser.parse_args()

dict_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dict_dir)
print("Dictionaries loaded.")

data_path = Path(args.dataset_path)
dataset = Articles(data_path)
print("Data loaded.")

dataset.tokenize()
print("Data tokenized.")
word_counter = collections.Counter()
for example in dataset.examples:
    word_counter.update(example['text'])

unique_words = list(word_counter.keys())
print(len(set(unique_words)))

abs_model_path = Path(args.model_path)
kwargs = dict(n_publications=len(final_publication_ids),
              n_articles=len(final_url_ids),
              n_attributes=len(final_word_ids),
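              # remaining kwargs and model construction assumed to match Example #5 below
              emb_size=100,
              sparse=False,
              use_article_emb=False,
              mode='mean')
model = InnerProduct(**kwargs)
model.load_state_dict(torch.load(abs_model_path))
print("Model Loaded.")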
Example #4
if torch.cuda.is_available() and args.use_gpu:
    device = "cuda"
elif not args.use_gpu:
    device = "cpu"
else:
    device = "cpu"
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")
print("-------------------")

# set output directory path
output_path = Path(args.output_dir)

# load in dataset
raw_data_path = Path(args.dataset_path)
raw_data = Articles(raw_data_path)
print("Data Loaded")
print("-------------------")

# load dictionaries from path
dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir)
print("Dictionaries Loaded")
print("-------------------")

# map items to their dictionary values
if args.map_items:
    # initialize tokenizer from BERT library
    tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)
    print("Tokenizer Initialized!")
Example #5
                    help='Path to data to be ranked.')

parser.add_argument('--mapped_data_dir',
                    type=expand_path,
                    required=True,
                    help="The place to store the mapped data.")

args = parser.parse_args()

dict_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dict_dir)
print("Dictionaries loaded.")

data_path = Path(args.dataset_path)
dataset = Articles(data_path)
print("Data loaded.")

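# rebuild the model with sizes taken from the loaded dictionaries, then restore the trained weights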
abs_model_path = Path(args.model_path)
kwargs = dict(n_publications=len(final_publication_ids),
              n_articles=len(final_url_ids),
              n_attributes=len(final_word_ids),
              emb_size=100,
              sparse=False,
              use_article_emb=False,
              mode='mean')
model = InnerProduct(**kwargs)
model.load_state_dict(torch.load(abs_model_path))
print("Model Loaded.")

dataset.tokenize()
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")

# set output directory path
output_path = Path(args.output_dir)

# tensorboard log and graph output folder
log_tensorboard_dir = output_path / "runs" / args.word_embedding_type
writer = SummaryWriter(log_tensorboard_dir)

# load datasets
train_path = Path(args.train_path)
test_path = Path(args.test_path)
eval_path = Path(args.eval_path)

train_data = Articles(train_path)
test_data = Articles(test_path)
eval_data = Articles(eval_path, index_file=args.index_file_path)
print("Data Loaded")

# initialize tokenizer from BERT library
tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)
print("Tokenizer Initialized!")

# create and save or load dictionaries based on arguments
if args.create_dicts:
    (
        final_word_ids,
        final_url_ids,
        final_publication_ids,
    ) = dictionary.create_merged_dictionaries(
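        # arguments assumed to match the same call in Example #2 above
        train_data.examples, "target")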
Example #7
if torch.cuda.is_available() and args.use_gpu:
    device = "cuda"
elif not args.use_gpu:
    device = "cpu"
else:
    device = "cpu"
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")
print("-------------------")

# set output directory path
output_path = Path(args.output_dir)

# load in dataset
raw_data_path = Path(args.dataset_path)
raw_data = Articles(raw_data_path)
print("Data Loaded")
print("-------------------")

# load dictionaries from path
dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(dictionary_dir)
print("Dictionaries Loaded")
print("-------------------")

# map items to their dictionary values
if args.map_items:
    # tokenize data and split into words
    raw_data.tokenize()
    # map items to their ids in dictionaries and filter articles
    proper_data = raw_data.map_items(final_word_ids,
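                                     # remaining arguments assumed to match the map_items call in Example #9 below
                                     final_url_ids,
                                     final_publication_ids)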
Example #8
)
parser.add_argument(
    "--tokenizer_file", type=str, help="Designate tokenizer source file.",
)

args = parser.parse_args()
tokenizer = BertWordPieceTokenizer(args.tokenizer_file, lowercase=True)

dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir
)
print("Dictionaries loaded.")

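# optionally filter the dataset while mapping, restricting by minimum length and, if given, a day range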
if args.filter:
    raw_dataset = Articles(args.dataset_path)
    print("Initial: ", len(raw_dataset))
    if args.days is not None:
        filtered_data = raw_dataset.map_items(
            tokenizer,
            final_url_ids,
            final_publication_ids,
            filter=True,
            min_length=args.min_length,
            day_range=args.days,
        )
    else:
        filtered_data = raw_dataset.map_items(
            tokenizer,
            final_url_ids,
            final_publication_ids,
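            # remaining arguments assumed to mirror the day-filtered call above, without day_range
            filter=True,
            min_length=args.min_length,
        )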
    print("Cannot use GPU. Using CPU instead.")
print(f"Device: {device}")

# set output directory path
output_path = Path(args.output_dir)

# load in dataset, add easily returnable link, then create PyTorch Dataset
raw_data_path = Path(args.dataset_path)
temp_df = pd.read_json(raw_data_path)
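# guarantee `link` and `orig_title` columns exist before writing the dataset back out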
if "link" not in temp_df.columns:
    temp_df['link'] = temp_df['url']
if "orig_title" not in temp_df.columns:
    temp_df['orig_title'] = temp_df['title']
temp_df.to_json(args.dataset_path, orient="records")

raw_data = Articles(raw_data_path)
print("Data Loaded")

# load dictionaries from path
dictionary_dir = Path(args.dict_dir)
final_word_ids, final_url_ids, final_publication_ids = dictionary.load_dictionaries(
    dictionary_dir)

# map items to their dictionary values
if args.map_items:
    raw_data.map_items(final_word_ids, final_url_ids, final_publication_ids)
    mapped_data_path = Path(args.data_dir) / "mapped-data"
    print("Mapped Data!")
    if not mapped_data_path.is_dir():
        mapped_data_path.mkdir()