def test_read_words(self):
    words = ["This", "is", "words,", "Anthony!"]
    with tempfile.TemporaryFile(mode="w+") as tmp:
        tmp.write(" ".join(words))
        tmp.seek(0)
        reader = Reader(tmp)
        self.assertEqual(words, [w for w in reader.read_words()])
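# A minimal sketch of the read_words generator the test above relies on,
# assuming Reader wraps an already-open file object; the project's real
# Reader may buffer or tokenize differently.
class Reader:
    def __init__(self, file_obj):
        self.file_obj = file_obj

    def read_words(self):
        # Yield whitespace-separated tokens, reading line by line.
        for line in self.file_obj:
            for word in line.split():
                yield word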
class TestReader(unittest.TestCase):
    def setUp(self):
        self.reader = Reader()
        self.tokenized_text = word_tokenize(TEXT)
        self.classified_text = self.reader.st.tag(self.tokenized_text)

    def test_init(self):
        assert TEST1 == self.classified_text

    def test_read_files(self):
        self.lst_news = self.reader.read_files("data/bbc")
        self.assertFalse(len(self.reader.file_names) == 0)
        self.assertTrue(os.access(self.reader.file_names[0], os.R_OK))

    def test_parse_news(self):
        self.lst_news = self.reader.read_files("data/bbc")
        # test on a subset of news articles, e.g. 10 files
        res = self.reader.parse_news(self.lst_news[:10])

    def test_filter_stop_words(self):
        example = [
            'This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off',
            'the', 'stop', 'words', 'filtration', '.'
        ]
        res = self.reader.filter_stop_words(example)
        print(res)
        assert res == [
            'sample', 'sentence', 'showing', 'stop', 'words', 'filtration'
        ]

    def test_stem_words(self):
        example = ['game', 'gaming', 'gamed', 'games']
        res = self.reader.stem_words(example)
        assert res == ['game']
def setUp(self):
    self.transition = [[0, 2, 3], [3, 0, 4], [3, 4, 0]]
    self.books = [
        Book(0, 1, 5),
        Book(1, 0, 3),
        Book(2, 1, 10),
        Book(3, 1, 2),
        Book(4, 2, 8),
    ]
    self.readers = [
        Reader({0: 3, 1: 2}, 0, 10),
        Reader({1: 1, 2: 10, 3: 3}, 0, 10),
        Reader({0: 1, 1: 1, 2: 1, 3: 1, 4: 1}, 1, 10),
        Reader({2: 5, 4: 5}, 2, 10),
    ]
    self.state = State(self.transition, self.books, self.readers)
def test_invalid_feed(self):
    # Assume
    value = 5
    # Action
    reader = Reader()
    # Assert
    with self.assertRaises(TypeError):
        reader.feed(value)
def setUp(self):
    self.book_data = {0: 5, 1: 2, 3: 3}
    self.reader = Reader(books=self.book_data, location=1, max_weeks=6)
    self.books = [
        Book(0, 1, 3),
        Book(1, 0, 5),
        Book(2, 1, 4),
        Book(3, 1, 2)
    ]
def _build_bohr(self):
    self.bohr = TaskBohr()
    reader = Reader(self.file)
    words = deque()
    for index, word in enumerate(reader.read_words()):
        words.append((index, word))
        if len(words) >= self.MAX_KEY_SIZE:
            self._add_word(words)
            words.popleft()
    while len(words) > 0:
        self._add_word(words)
        words.popleft()
def test_reader_creator(self):
    """ Create reader test """
    res = create_reader(self.line.split()[1:], 12)
    expected = (2, Reader({0: 1, 1: 1, 2: 1, 3: 1, 4: 1}, 1, 12))
    self.assertEqual(expected, res)
def post(self, *args, **kwargs):
    _file = self.request.files.get('file')
    _money = "0.00"
    if _file:
        _money = Reader.getMoney(_file[0])
    self.write(json.dumps({"money": _money, "time": int(time.time())}))
def test_input_parser(self):
    ip = InputParser()
    for line in self.input_str.split("\n"):
        ip.parse_line(line)
    st = ip.get_state()
    trans = [[0, 2, 3], [3, 0, 4], [3, 4, 0]]
    books = [
        Book(0, 5, 0),
        Book(1, 3, 0),
        Book(2, 10, 1),
        Book(3, 2, 2),
        Book(4, 8, 2),
    ]
    readers = [
        Reader({0: 10, 1: 2}, 0, 32),
        Reader({1: 1, 2: 10, 3: 3}, 0, 32),
        Reader({0: 1, 1: 1, 2: 1, 3: 1, 4: 1}, 1, 32),
        Reader({2: 5, 4: 5}, 2, 32),
    ]
    expected = State(trans, books, readers)
    self.assertEqual(trans, st._transition)
    self.assertEqual(books, st._books)
    self.assertEqual(readers, st._readers)
    self.assertEqual(0, st._score)
    self.assertEqual(expected, st)
def nyan_filter(self, status):
    token = Token()
    reader = Reader()
    api = token.get_key(reader.json_dir())
    print(status.text)
    text = status.text
    # for nyan in nyan_list:
    for nyan in open('./dictionary.txt', 'r'):
        nyan = nyan.replace('\n', '')
        print(nyan)
        if nyan in text:
            print("OUT!! Delete Tweet!! Nyan Nyan Filter Start Up!!")
            for tweet in tweepy.Cursor(api.user_timeline).items():
                api.destroy_status(tweet.id)
                break
            # Tweet a notice that the filter fired ("Nyan-nyan filter activated!!").
            api.update_status("にゃんにゃんフィルター発動!!\n" +
                              datetime.now().strftime("%Y/%m/%d %H:%M:%S"))
        else:
            print("No problem!!")
def get_data(date_range: DateGen, places: dict):
    reader = Reader()
    for date in date_range:
        for country, cities in places.items():
            for city in cities:
                reader.get_webpage(date, country, city)
                reader.append_file("warsaw.csv")
class TestReader(unittest.TestCase):
    reader = Reader('unit-test', Queue(), Queue())

    def test_parse_log_line(self):
        fixture_line = '199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245'
        formatted_line = {
            'remote_host': '199.72.81.55',
            'user_identity': '-',
            'user_name': '-',
            'datetime': datetime(1995, 7, 1, 4, 0, 1),
            'request': 'GET /history/apollo/ HTTP/1.0',
            'status_code': 200,
            'response_size': 6245,
            'section': '/history'
        }
        self.assertEqual(formatted_line, self.reader.parse_log_line(fixture_line))

        fixture_line = '199.72.81.55 - jeremy [01/Jul/1995:00:01:43 +0700] "GET / HTTP/1.0" 200 7074'
        formatted_line = {
            'remote_host': '199.72.81.55',
            'user_identity': '-',
            'user_name': 'jeremy',
            'datetime': datetime(1995, 6, 30, 17, 1, 43),
            'request': 'GET / HTTP/1.0',
            'status_code': 200,
            'response_size': 7074,
            'section': '/'
        }
        self.assertEqual(formatted_line, self.reader.parse_log_line(fixture_line))

        fixture_line = '199.72.81.55 [01/Jul/1995:00:01:43 +0700] "GET / HTTP/1.0" 200'
        self.assertRaises(LineFormatError, lambda: self.reader.parse_log_line(fixture_line))

    def test_get_section(self):
        self.assertEqual('/history', self.reader.get_section('GET /history/apollo/ HTTP/1.0'))
        self.assertEqual('/major-history', self.reader.get_section('GET /major-history/apollo/ HTTP/1.0'))
        self.assertEqual('/minor.history', self.reader.get_section('GET /minor.history/apollo/ HTTP/1.0'))
        self.assertEqual('/', self.reader.get_section('GET /history.php HTTP/1.0'))
        self.assertEqual('/', self.reader.get_section('GET / HTTP/1.0'))
        self.assertRaises(LineFormatError, lambda: self.reader.get_section('test test'))

    def test_parse_datetime(self):
        self.assertEqual(datetime(2006, 12, 7, 18, 23, 54), self.reader.parse_datetime('07/Dec/2006:14:23:54 -0400'))
        self.assertRaises(IndexError, lambda: self.reader.parse_datetime('07/Dec/2006:14:23:54'))
        self.assertRaises(ValueError, lambda: self.reader.parse_datetime('Test test'))
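# A minimal sketch of the timestamp normalisation the tests above expect
# (local log time plus offset converted to naive UTC). The project's real
# Reader.parse_datetime may split the offset differently; this standalone
# function is only an illustration consistent with the assertions.
from datetime import datetime, timedelta

def parse_datetime(raw):
    parts = raw.split(' ')
    local = datetime.strptime(parts[0], '%d/%b/%Y:%H:%M:%S')  # ValueError on garbage
    offset = parts[1]                                         # IndexError if the offset is missing
    sign = -1 if offset.startswith('-') else 1
    delta = timedelta(hours=int(offset[1:3]), minutes=int(offset[3:5]))
    # '07/Dec/2006:14:23:54 -0400' -> datetime(2006, 12, 7, 18, 23, 54)
    return local - sign * delta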
for i, _file in enumerate(categories):
    print " " + str(i + 1) + ".", _file
print

category = -1
while category < 0 or category > len(categories):
    category = int(raw_input("Enter the category number: "))
if category != 0:
    category -= 1

category_dir = join(DIRECTORY, categories[category])

if len(sys.argv) > 1:
    _id = sys.argv[1]
    random_puzzle = _id + ".krk"
else:
    cat_files = [f for f in listdir(category_dir) if isfile(join(category_dir, f))]
    random_puzzle = choice(cat_files)

reader = Reader(join(category_dir, random_puzzle))
puzzle = KRPuzzle(reader.get_level())
try:
    puzzle.start()
except Exception as e:
    print e.message
    puzzle.end()
# puzzle.output()
def run(fp):
    reader = Reader(fp)
    reader.read_file()
if __name__ == '__main__':
    try:
        config = ConfigLoader(DIR_NAME + '/config.ini')
        parameters = config.configure_threads()

        read_line_queue = Queue()
        traffic_queue = Queue()
        alert_content = {
            'type': AlertSystem.ALERT_RECOVER_TYPE,
            'to_display': False
        }

        reader = Reader(input_queue=read_line_queue,
                        input_traffic_queue=traffic_queue,
                        **parameters['reader'])
        displayer = Displayer(output_queue=read_line_queue,
                              alert_content=alert_content,
                              **parameters['displayer'])
        alert_system = AlertSystem(output_traffic_queue=traffic_queue,
                                   alert_content=alert_content,
                                   **parameters['alert_system'])

        has_simulator = False
        log_simulator = None
        if 'log_simulator' in parameters.keys() and parameters['log_simulator'] is not None:
            log_simulator = LogSimulator(**parameters['log_simulator'])
            has_simulator = True
class GUI(QtWidgets.QWidget):
    __reader = Reader()
    __viewer = Viewer()
    __glWidget = glWidget()
    __slider = QtWidgets.QSlider(QtCore.Qt.Horizontal)
    # Draw modes (Russian labels): texturing, quad strips (2*n + 2 vertices), quads (4*n vertices).
    __draws = {
        "Текстурирование": __viewer.paint_texture,
        "Прямоугольники 2*n + 2 вершин": __viewer.paint_quadstrip,
        "Прямоугольники 4*n вершин": __viewer.paint_quads
    }
    __curr_draw = "Текстурирование"

    def __init__(self, parent=None):
        super().__init__(parent)
        self.__grid = QtWidgets.QGridLayout(self)
        self.__setup_main_widget()

    def __setup_main_widget(self):
        self.__glWidget.render.connect(self.__viewer.paint_texture)
        self.__grid.addWidget(self.__glWidget, 0, 0, 10, 2)

        self.__slider.sliderReleased.connect(self.__connect_value_changed)
        self.__grid.addWidget(self.__slider, 11, 0)

        self.__curr_slider = QtWidgets.QLabel()
        self.__curr_slider.setText("0")
        self.__grid.addWidget(self.__curr_slider, 11, 1)

        button = QtWidgets.QPushButton()
        button.setText("Open Tomogram")
        button.clicked.connect(self.__connect_open_tomogram)
        self.__grid.addWidget(button, 12, 0, 1, 2)

        draw_list = QtWidgets.QComboBox()
        draw_list.addItems(self.__draws.keys())
        draw_list.activated[str].connect(self.__connect_chande_draw)
        self.__grid.addWidget(draw_list, 13, 0, 1, 2)

        self.__min_input = QtWidgets.QLineEdit()
        self.__min_input.setText("0")
        self.__grid.addWidget(self.__min_input, 14, 0)

        self.__lenght_input = QtWidgets.QLineEdit()
        self.__lenght_input.setText("2000")
        self.__grid.addWidget(self.__lenght_input, 14, 1)

        button = QtWidgets.QPushButton()
        button.setText("Set transfer parameters")
        button.clicked.connect(self.__connect_transfer_parameters)
        self.__grid.addWidget(button, 15, 0, 1, 2)

        button = QtWidgets.QPushButton()
        button.setText("Start render")
        button.clicked.connect(self.__start_render)
        self.__grid.addWidget(button, 16, 0, 1, 2)

    def __connect_chande_draw(self, draw_name: str):
        self.__glWidget.render.disconnect(self.__draws[self.__curr_draw])
        self.__curr_draw = draw_name
        self.__glWidget.render.connect(self.__draws[self.__curr_draw])

    def __connect_value_changed(self):
        value = self.__slider.value()
        self.__curr_slider.setText(str(value))
        self.__viewer.set_layer(value)
        self.__start_render()

    def __connect_transfer_parameters(self):
        min, lenght = int(self.__min_input.text()), int(self.__lenght_input.text())
        self.__viewer.set_transfer_parameters(min, lenght)

    def __connect_open_tomogram(self):
        tomogram_path = QtWidgets.QFileDialog.getOpenFileName(
            self, "Open Tomogram", ".")[0]
        if not tomogram_path:
            return
        shape, tomogram = self.__reader.Read(tomogram_path)
        self.__slider.setRange(0, shape[2] - 1)
        self.__slider.setValue(0)
        self.__curr_slider.setText("0")
        w, h = self.__glWidget.size().width(), self.__glWidget.size().height()
        min, lenght = int(self.__min_input.text()), int(self.__lenght_input.text())
        self.__viewer.set_tomogram(shape, tomogram)
        self.__viewer.set_transfer_parameters(min, lenght)
        self.__viewer.setup_view(w, h)

    def __start_render(self):
        self.__glWidget.update()
def runf1(conn, args):
    # evaluation dataset
    # english context so that answer is in english
    data = MLQADataset(args.dataset, 'en', args.langQuestion)

    # initialize searcher
    init(conn, 'wiki', args)

    # initialise reader
    print("Reader")
    reader = Reader(model="models/distilbert-base-uncased-distilled-squad/",
                    tokenizer="models/distilbert-uncased-my-tok")

    # initialise translator
    print("Translator")
    languages = {args.langQuestion, args.langSearch, 'en'}
    translator = Translator(languages)
    print("Translating between: {}".format(str(languages)))

    counters = {'f1': [], 'tally': 0, 'score': []}
    for doc in data.get():
        questionSearch = translator(doc['question'], args.langQuestion, args.langSearch)
        #print("questionSearch ", questionSearch.encode('utf-8'))
        search(conn, questionSearch, args.langSearch)
        if args.langSearch == 'en':
            questionRead = questionSearch
        else:
            questionRead = translator(doc['question'], args.langQuestion, 'en')
        #print("questionRead ", questionRead.encode('utf-8'))

        # recv = {'search':[{'id':qid, 'docs':[{'context':'...', 'title':'...', 'score':score}]}]
        bestScore = 0
        recv = recvall(conn)
        for n, docSearch in enumerate(recv['search'][0]['docs']):
            # the reader answers the question given each retrieved context
            #print("n: ", n)
            #print("contextSearch ", docSearch['context'].encode('utf-8'))
            contextRead = translator(docSearch['context'], args.langSearch, 'en')
            #print("contextRead ", contextRead.encode('utf-8'))
            _, answerRead, score = reader(questionRead, contextRead)
            if score >= bestScore:
                bestScore = score
                bestAnswer = answerRead
                bestContext = contextRead

        #print("goldAnswer: ", doc['answer'].encode('utf-8'))
        #print("Answer: ", bestAnswer.encode('utf-8'))
        counters['f1'].append(f1_drqa(bestAnswer, doc['answer']))
        counters['tally'] += 1
        counters['score'].append(bestScore)

        # stop early if a limit was requested
        if args.stop != 0 and counters['tally'] >= args.stop:
            print("Stopping at: ", counters['tally'])
            break
        #if i > 1:
        #    break

    f1 = np.array(counters['f1'])
    exact_match = f1[f1 == 1.0].sum() / f1.size
    print("Exact match: {}".format(exact_match))
    print("F1 mean: {}".format(f1.mean()))
    print("Mean score: {}".format(sum(counters['score']) / counters['tally']))
    print("Total: {}".format(counters['tally']))

    if args.save_as:
        print("Writing to: ", args.save_as)
        with open(args.save_as, "w") as fp:
            json.dump(counters, fp)

    close(conn, args.stop_server)
    return f1.mean()
def read(args):
    """reader function"""
    db_file = args.wiki_db_file
    reader_feature_file = args.reader_feature_file
    reader_example_file = args.reader_example_file
    encoder_ck_file = args.reader_encoder_ck_file
    downstream_ck_file = args.reader_downstream_ck_file
    albert_model_path = args.albert_model_path
    reader_result_file = args.reader_result_file
    seed = args.seed
    sp_threshold = args.sp_threshold
    seq_len = args.seq_len
    batch_size = args.reader_batch_size
    para_limit = args.max_para_num
    sent_limit = args.max_sent_num

    random.seed(seed)
    np.random.seed(seed)

    t1 = time()

    doc_db = DocDB(db_file)

    generator = DataGenerator(feature_file_path=reader_feature_file,
                              example_file_path=reader_example_file,
                              batch_size=batch_size,
                              seq_len=seq_len,
                              para_limit=para_limit,
                              sent_limit=sent_limit,
                              task_type="reader")
    example_dict = generator.example_dict
    feature_dict = generator.feature_dict
    answer_dict = defaultdict(lambda: defaultdict(list))
    new_answer_dict = {}
    total_sp_dict = defaultdict(list)
    new_total_sp_dict = defaultdict(list)

    tokenizer = AlbertTokenizer.from_pretrained(albert_model_path)
    new_tokens = ['[q]', '[/q]', '<t>', '</t>', '[s]']
    tokenizer.add_tokens(new_tokens)

    reader = Reader(batch_size=batch_size,
                    encoder_ck_file=encoder_ck_file,
                    downstream_ck_file=downstream_ck_file)

    print("start reading ...")

    for _, batch in tqdm(enumerate(generator)):
        input_ids = Tensor(batch["context_idxs"], mstype.int32)
        attn_mask = Tensor(batch["context_mask"], mstype.int32)
        token_type_ids = Tensor(batch["segment_idxs"], mstype.int32)
        context_mask = Tensor(batch["context_mask"], mstype.float32)
        square_mask = Tensor(batch["square_mask"], mstype.float32)
        packing_mask = Tensor(batch["query_mapping"], mstype.float32)
        para_start_mapping = Tensor(batch["para_start_mapping"], mstype.float32)
        sent_end_mapping = Tensor(batch["sent_end_mapping"], mstype.float32)
        unique_ids = batch["unique_ids"]
        sent_names = batch["sent_names"]
        cache_mask = Tensor(np.tril(np.triu(np.ones((seq_len, seq_len)), 0), 30),
                            mstype.float32)

        _, _, q_type, _, sent_logit, y1, y2 = reader(input_ids, attn_mask,
                                                     token_type_ids,
                                                     context_mask, square_mask,
                                                     packing_mask, cache_mask,
                                                     para_start_mapping,
                                                     sent_end_mapping)

        type_prob = ops.Softmax()(q_type).asnumpy()

        answer_dict_ = convert_to_tokens(example_dict,
                                         feature_dict,
                                         batch['ids'],
                                         y1.asnumpy().tolist(),
                                         y2.asnumpy().tolist(),
                                         type_prob,
                                         tokenizer,
                                         sent_logit.asnumpy(),
                                         sent_names,
                                         unique_ids)
        for q_id in answer_dict_:
            answer_dict[q_id] = answer_dict_[q_id]

    for q_id in answer_dict:
        res = answer_dict[q_id]
        answer_text_ = res[0]
        sent_ = res[1]
        sent_names_ = res[2]
        new_answer_dict[q_id] = answer_text_

        predict_support_np = ops.Sigmoid()(Tensor(sent_, mstype.float32)).asnumpy()

        for j in range(predict_support_np.shape[0]):
            if j >= len(sent_names_):
                break
            if predict_support_np[j] > sp_threshold:
                total_sp_dict[q_id].append(sent_names_[j])

    for _id in total_sp_dict:
        _sent_names = total_sp_dict[_id]
        for para in _sent_names:
            title = make_wiki_id(para[0], 0)
            para_original_title = doc_db.get_doc_info(title)[-1]
            para[0] = para_original_title
            new_total_sp_dict[_id].append(para)

    prediction = {'answer': new_answer_dict, 'sp': new_total_sp_dict}

    with open(reader_result_file, 'w') as f:
        json.dump(prediction, f, indent=4)

    t2 = time()
    print(f"reader cost time: {t2-t1} s")
def json_decode(json_text):
    return parse(Reader(json_text))
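# A minimal sketch of the character-stream Reader that json_decode (and
# parse_fun further below) could hand to parse(). The peek/next/skip_whitespace
# names are illustrative assumptions, not necessarily the project's real API.
class Reader:
    def __init__(self, text):
        self.text = text
        self.pos = 0

    def peek(self):
        # Return the current character without consuming it, or None at EOF.
        return self.text[self.pos] if self.pos < len(self.text) else None

    def next(self):
        # Consume and return the current character.
        ch = self.peek()
        self.pos += 1
        return ch

    def skip_whitespace(self):
        while self.peek() is not None and self.peek().isspace():
            self.pos += 1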
class ReaderTest(unittest.TestCase):
    """ Reader Test """

    def setUp(self):
        self.book_data = {0: 5, 1: 2, 3: 3}
        self.reader = Reader(books=self.book_data, location=1, max_weeks=6)
        self.books = [
            Book(0, 3, 1),
            Book(1, 5, 0),
            Book(2, 4, 1),
            Book(3, 2, 1)
        ]

    def test_creation(self):
        """ Tests the creation of a reader. """
        self.assertEqual(self.book_data, self.reader._books)
        self.assertEqual(1, self.reader._location)
        self.assertEqual([0, 0, 0, 0, 0, 0], self.reader._timing)

    def test_read(self):
        """ Test a simple read of book 0. """
        self.reader._timing = [3, 2, 2, 1, 1, 1]
        score, time = self.reader.read(self.books[0], 0, 3)
        self.assertTrue(self.books[0].id_book not in self.reader._books)
        self.assertEqual(3, score)
        self.assertEqual(6, time)
        self.assertEqual([3, 3, 3, 2, 2, 2], self.reader._timing)
        score, time = self.reader.read(self.books[3], 4, 5)
        self.assertTrue(self.books[3].id_book not in self.reader._books)
        self.assertEqual(0, score)
        self.assertEqual(7, time)
        self.assertEqual([3, 3, 3, 2, 3, 3], self.reader._timing)

    def test_another_read(self):
        """ Test a more complex read of book 0. """
        self.reader._timing = [3, 2, 3, 3, 1, 1]
        score, time = self.reader.read(self.books[0], 0, 6)
        self.assertTrue(self.books[0].id_book not in self.reader._books)
        self.assertEqual(0, score)
        self.assertEqual(9, time)

    def test_read_not_interested(self):
        """ Test that reading a book the reader is not interested in fails. """
        # assertRaises is used instead of try/fail/except AssertionError,
        # which would also swallow the AssertionError raised by self.fail().
        with self.assertRaises(AssertionError):
            self.reader.read(self.books[2], 0, 0)

    def test_read_different_location(self):
        """ Test that reading a book held in a different location fails. """
        with self.assertRaises(AssertionError):
            self.reader.read(self.books[1], 0, 1)
        # now, try from the right to the left.
        mark_group_i = len(row)
        for i, group in list(enumerate(groups))[-1::-1]:
            if i in used_groups:
                continue
            group_type = types[i]
            if group_type == self.NULL:
                # if the type is self.NULL, quit trying.
                break
            elif group_type == self.MARK:
                mark_group_i -= 1
                if mark_group_i >= 0 and row[mark_group_i] == group:
                    indices.add(mark_group_i)
        return indices


if __name__ == '__main__':
    if len(sys.argv) == 1:
        reader = Reader("levels/lambda.krk")
    else:
        reader = Reader(sys.argv[1])
    _level = reader.get_level()
    puzzle = KRPuzzle(_level)
    puzzle.start()
class Graph(object):
    """
    Base class which represents the heterogeneous textual graph (undirected
    graph). G = <V, E>.

    V is the set of nodes (objects), including 3 types of objects (i.e. new
    entities, known entities, and contextual words). Entities are words (with
    label: PERSON, LOCATION, or ORGANIZATION) whereas contextual words are the
    remaining uni-gram words. New entities are the entities not in DBpedia,
    and known entities are the entities in DBpedia.

    E is a set of edges (co-occurrences) of entity-entity, entity-word, and
    word-word pairs. Words within every 5-word sliding window in a news
    sentence are considered to be co-occurring with each other. The weights
    are represented by an adjacency matrix using a dataframe.

    Attributes:
        nodes: dictionary of nodes {"N (new entity)": [(word, label)],
            "K (known entity)": [(word, label)],
            "C (contextual word)": [(word, label)]} in the graph; includes 3
            types of objects (i.e. new entities, known entities, and
            contextual words).
        edges: set of tuples, e.g. ("A", "B") indicates a link between node
            "A" and node "B".
        edge_weights: the weights represented by an adjacency matrix using a
            dataframe.
        news: list of news articles (articles are string type).
    """

    def __init__(self, lst_news, window_size=5):
        """Inits Graph

        Args:
            lst_news: list of string. list of news articles.
        """
        self.window_size = window_size
        self.news = list(lst_news)
        self.reader = Reader()
        self.search = Search()
        self.nodes = self.__create_nodes()
        self.edges = self.__create_edges()
        self.edge_weights = self.__create_weights()

    def __create_nodes(self):
        """Private class method

        Takes in a list of news articles (articles are string types):
        1) tokenize the articles
        2) remove stopwords
        3) label words with 3 labels (i.e. PERSON, ORGANIZATION, LOCATION)
        4) match entities (i.e. person, org, loc) against DBpedia

        Returns:
            A dictionary containing 3 types of objects (i.e. new entities,
            known entities, and contextual words). E.g.
            {"N": [("Washington", "LOCATION")],
             "K": [("Trump", "PERSON"), ("Hua Wei", "ORGANIZATION")],
             "C": [("the", "O"), ("am", "O")]}
        """
        # parse news articles
        tagged_words = self.reader.parse_news(self.news)
        # separate entities from contextual words
        entities, cwords = self.__entities_words(tagged_words)
        new_e, known_e = self.search.query(entities)
        ret = dict()
        ret["N"] = list(set(new_e))
        ret["K"] = list(set(known_e))
        ret["C"] = list(set(cwords))
        return dict(ret)

    def get_nodes(self):
        """ Getter method which returns all nodes from self.nodes. """
        ret = set()
        for i in self.nodes["N"]:
            ret.add(i[0])
        for i in self.nodes["K"]:
            ret.add(i[0])
        for i in self.nodes["C"]:
            ret.add(i[0])
        return list(ret)

    def get_entities(self):
        """
        Getter method which returns a list of entities (i.e. words tagged
        with "PERSON", "LOCATION", "ORGANIZATION") from self.nodes.
        """
        ret = set()
        for i in self.nodes["N"]:
            ret.add(i[0])
        for i in self.nodes["K"]:
            ret.add(i[0])
        return list(ret)

    def get_words(self):
        """
        Getter method which returns a list of contextual words from
        self.nodes.
        """
        ret = set()
        for i in self.nodes["C"]:
            ret.add(i[0])
        return list(ret)

    def __create_edges(self, window_size=5):
        """Private class method

        Takes in a list of news articles and extracts the co-occurrence links
        between nodes. Nodes within a 5-word sliding window in a news sentence
        are considered to be co-occurring with each other. The frequencies of
        nodes co-appearing in news sentences serve as the weights of these
        links.

        Returns:
            A set of links between nodes.
        """
        e = set()
        for article in self.news:
            self.tokenized_text = word_tokenize(article)
            self.tokenized_text = self.reader.filter_stop_words(
                self.tokenized_text)
            generator = self.sliding_window(self.tokenized_text,
                                            self.window_size)
            for t in generator:
                e = e.union(set(itertools.combinations(t, 2)))
        return set(e)

    def get_edges(self):
        """ Getter method which returns a set of edges from self.edges. """
        return set(self.edges)

    def sliding_window(self, seq, n=5):
        """
        Returns a sliding window (of width n) over data from the iterable:
        s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...

        Args:
            seq: list of words; one news article split into a list of words.
            n: int; size of the sliding window.

        Returns:
            An iterator over all the sliced windows. See the test case in
            `tests/test_graph.py` for more details.
        """
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) <= n:
            yield result
        for elem in it:
            result = result[1:] + (elem, )
            yield result

    def __create_weights(self):
        """Private class method

        Create the weight matrix using a pandas dataframe. The value at the
        ith row and jth column is the count of (undirected) links between
        node i and node j.

        Returns:
            A copy of the dataframe representing the weight matrix.
        """
        words = self.get_nodes()
        df = pd.DataFrame(index=words, columns=words).fillna(0)
        for article in self.news:
            self.tokenized_text = word_tokenize(article)
            self.tokenized_text = self.reader.filter_stop_words(
                self.tokenized_text)
            generator = self.sliding_window(self.tokenized_text,
                                            self.window_size)
            for t in generator:
                for tup in set(itertools.combinations(t, 2)):
                    if tup[0] != tup[1]:
                        df.loc[tup[0], tup[1]] += 1
                        df.loc[tup[1], tup[0]] += 1
        return df.copy()

    def get_weights(self):
        return self.edge_weights.copy()

    def __entities_words(self, tagged_words):
        """Private class method

        Separate the entity words from the contextual words.

        Args:
            tagged_words: list of tuples (word, label).

        Returns:
            entities: words tagged with "PERSON", "LOCATION", "ORGANIZATION".
            cwords: words tagged with "O".
        """
        entities = list()
        cwords = list()
        for word in tagged_words:
            if word[1] == "O":
                # contextual words
                cwords.append(word)
            else:
                entities.append(word)
        assert len(entities) + len(cwords) == len(tagged_words)
        return entities, cwords

    def update_weight(self, e, w):
        """
        Update the edge weight in the internal weight matrix.

        Args:
            e: tuple; a tuple containing two nodes, e.g. ("A", "B").
            w: int; the new weight associated with e.
        """
        if e[0] not in set(self.get_nodes()):
            raise ValueError("Node {} is not in the graph".format(str(e[0])))
        if e[1] not in set(self.get_nodes()):
            raise ValueError("Node {} is not in the graph".format(str(e[1])))
        if e in self.edges and w <= 0:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
            self.edges.remove(e)
            self.edges.remove((e[1], e[0]))
        elif e in self.edges and w > 0:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
        else:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
            self.edges.add(e)
def __init__(self, f, start, goal):
    self.reader = Reader(f)
    self.start = tuple(map(int, start.split(',')))
    self.goal = tuple(map(int, goal.split(',')))
    self.expanded = []
class Embedding(object):
    """
    Python class which produces the joint embedding of the words and entities.

    To use it: E = Embedding(), then
    entity_embedding, word_embedding = E.joint_embedding()

    Attributes:
        kg_graph: the knowledge graph
        ee_graph: a heterogeneous subgraph of HEER, showing relations between entities
        cc_graph: a heterogeneous subgraph of HEER, showing relations between words
        ec_graph: a bipartite subgraph of HEER, showing relations between entities and words
    """

    def __init__(self):
        self.read = Reader()
        # self.news_list = ["Today's policy is about global warming", "Donald Trupm is the president of United States", "UCLA is the best school in southern California", "Noor Nakhaei is going to be student at UCLA", "the Boelter Hall is a dungeon", "UCLA is colaborating with Stanford", "Wenhao is meeting Trump", "Trump is in United Kingdom"]
        self.news_list = self.read.read_csv_file("./data/mixed-news/articles-title_only.csv")
        self.graph = Graph(self.news_list)
        self.words = self.graph.get_words()
        self.entities = self.graph.get_entities()
        self.ee_graph = EE(self.news_list)
        self.ec_graph = EC(self.news_list)
        self.cc_graph = CC(self.news_list)
        print("cc", self.cc_graph.get_edges())
        self.kg_graph = KG(self.news_list)
        self.d = 10  # THIS SHOULD BE CHANGED! 4, 10, 18
        self.S = pd.DataFrame(1, index=self.entities, columns=range(0, self.d))
        self.T = pd.DataFrame(1, index=self.words, columns=range(0, self.d))
        for i in self.S.columns:
            for j in self.S.index:
                self.S[i][j] = randint(0, 10)
        for i in self.T.columns:
            for j in self.T.index:
                self.T[i][j] = randint(0, 10)

    def weighted_sample(self, items, n):
        """
        Sample items proportionally to their weight attribute.

        Args:
            items: the list of edges to choose between.
            n: number of edges to choose.

        Returns:
            Yields each chosen edge, proportionally to its weight.
        """
        total = 0
        for j in items:
            total = float(sum(w for a, b, w in items))
        i = 0
        a, b, w = items[0]
        while n:
            x = total * (1 - random.random() ** (1.0 / n))
            total -= x
            while x > w:
                x -= w
                i += 1
                a, b, w = items[i]
            w -= x
            yield a, b
            n -= 1

    def embedding_update(self, s, t, g, k=3):
        """
        Update the embeddings of words and entities.

        Args:
            s: a binary flag indicating the type of embedding to update.
            t: a binary flag indicating the type of embedding to update.
            g: the graph; it can be the ee, cc, or ec subgraph, or the kg graph.
            k: number of negative edges.
        """
        eta = 0.2
        # Sample an edge from G and draw k negative edges; sampling needs all
        # edge weights, and the sampled nodes are the ones whose embeddings
        # get updated.
        df = g.get_weights()
        num_cols = g.get_nodes()
        edges = []
        for i in num_cols:
            for j in num_cols:
                if df[i][j] != 0:
                    edge = []
                    edge.append(i)
                    edge.append(j)
                    edge.append(df[i][j])
                    edges.append(edge)
        sampled_edge = self.weighted_sample(edges, 1)
        for el in sampled_edge:
            sampled_node_a = el[0]
            sampled_node_b = el[1]
            # swap!
            if s == 1 and t == 1:
                print(sampled_node_a)
                print(sampled_node_b)
                if sampled_node_a in self.S.index:
                    s1 = sampled_node_b
                    sampled_node_b = sampled_node_a
                    sampled_node_a = s1
        # sampled_neg_nodes = []
        if s == 1 and t == 1:
            nodes = g.get_entities()
        else:
            nodes = g.get_nodes()
        # draw k negative edges!
        sampled_neg_nodes = random.sample(nodes, k)  # [k]
        sampled_neg_nodes.append(sampled_node_b)
        # up to here we have k negative edges, one positive edge, the graph, and S_t, T_t
        if s == 1 and t == 1:
            # S, T, G_ec
            sum = 0
            for i in range(k + 1):
                a = np.dot(self.S.loc[sampled_neg_nodes[i]], self.T.loc[sampled_node_a])
                if a > 123:
                    a = 123
                elif a < 0.1:
                    a = 0.5
                b = np.exp(a)
                sum = sum + b
            c = np.log(sum)
            d = self.S.loc[sampled_node_b].T
            e = - eta * d * c
            self.T.loc[sampled_node_a] = self.T.loc[sampled_node_a] - e
            sum = 0
            for i in range(k + 1):
                a = np.dot(self.S.loc[sampled_neg_nodes[i]], self.T.loc[sampled_node_a])
                if a > 123:
                    a = 123
                elif a < 0.1:
                    a = 0.5
                b = np.exp(a)
                sum = sum + b
            c = np.log(sum)
            d = self.T.loc[sampled_node_a].T
            e = - eta * d * c
            self.S.loc[sampled_node_b] = self.S.loc[sampled_node_b] - e
        elif s == 0 and t == 1:
            # T, T, G_cc
            sum = 0
            for i in range(k + 1):
                a = np.dot(self.T.loc[sampled_neg_nodes[i]], self.T.loc[sampled_node_a])
                if a > 123:
                    a = 123
                elif a < 0.1:
                    a = 0.5
                b = np.exp(a)
                sum = sum + b
            c = np.log(sum)
            d = self.T.loc[sampled_node_b].T
            e = - eta * d * c
            self.T.loc[sampled_node_a] = self.T.loc[sampled_node_a] - e
        elif s == 1 and t == 0:
            # S, S, G_ee
            sum = 0
            for i in range(k + 1):
                a = np.dot(self.S.loc[sampled_neg_nodes[i]], self.S.loc[sampled_node_a])
                if a > 123:
                    a = 123
                elif a < 0.1:
                    a = 0.5
                b = np.exp(a)
                sum = sum + b
            c = np.log(sum)
            d = self.S.loc[sampled_node_a].T
            e = - eta * d * c
            self.S.loc[sampled_node_b] = self.S.loc[sampled_node_b] - e

    def joint_embedding(self):
        """
        Run the iteration that minimises the cost function, calling the
        update function at each step.

        Attributes:
            theta: the guiding parameter, chosen empirically. The bigger it
                is, the more influence the kg graph has.
            k: number of negative samples.
            t: number of iterations.

        Returns:
            Two dataframes: the entity embedding (normalized_S) and the word
            embedding (normalized_T).
        """
        # The guiding parameter, set empirically; the bigger it is, the more we rely on the kg graph.
        theta = 0.5  # THIS SHOULD BE CHANGED! 0.2, 0.5, 0.7
        # number of negative samplings
        k = 2
        # number of iterations
        t = 100
        # the loop of the algorithm
        while t > 0:
            gamma = random.uniform(0, 1)
            if gamma <= theta:
                self.embedding_update(1, 0, self.kg_graph, k)
            else:
                self.embedding_update(1, 1, self.ec_graph, k)
                self.embedding_update(1, 0, self.ee_graph, k)
                self.embedding_update(0, 1, self.cc_graph, k)
            t = t - 1
        normalized_S = self.S.div(self.S.sum(axis=1), axis=0)
        normalized_T = self.T.div(self.T.sum(axis=1), axis=0)
        return normalized_S, normalized_T
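# Example usage, following the Embedding class docstring above; it assumes the
# CSV of article titles referenced in __init__ is present on disk.
if __name__ == "__main__":
    E = Embedding()
    entity_embedding, word_embedding = E.joint_embedding()
    print(entity_embedding.head())
    print(word_embedding.head())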
    indexer.createIndex()

if args.query != None:
    #if not os.path.isfile(idxfile):
    #    raise Exception("Could not find indexfile: {}".format(idxfile))
    if args.analyzer == None or args.language == 'all':
        raise ValueError(
            "To retrieve query you must specify analyzer and language")
    searcher = Searcher(index_path=args.index,
                        lang=args.language,
                        analyzer=args.analyzer,
                        dataset=args.dataset)
    searcher.queryTest(args.query)

if args.run == 'reader':
    reader = Reader()
    reader.run(lang=args.lang, analyzer=args.analyzer, dataset=args.dataset)

if args.metric == 'dist':
    metrics.hits(dataset=args.dataset,
                 langContext=args.language,
                 langQuestion=args.language,
                 distant=True,
                 k=50)

if args.metric == 'hit@k':
    metrics.hits(dataset=args.dataset,
                 langContext=args.language,
                 langQuestion=args.language,
                 distant=False,
from src.log_simulator import LogSimulator
from time import time
from queue import Queue

DIR_NAME = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    read_line_queue = Queue()
    traffic_queue = Queue()
    alert_content = {
        'type': AlertSystem.ALERT_RECOVER_TYPE,
        'to_display': False
    }

    reader = Reader(DIR_NAME + '/data/access-log.log', read_line_queue,
                    traffic_queue)
    displayer = Displayer(read_line_queue, alert_content, 10, True)
    alert_system = AlertSystem(80, traffic_queue, alert_content, 120)
    log_simulator = LogSimulator(DIR_NAME + '/data/access-log.log', 'localhost',
                                 ['/', '/section1'])

    current_time = time()
    log_simulator.start()
    reader.start()
    displayer.start()
    alert_system.start()

    while time() - current_time <= 120:
        log_simulator.resume()
        reader.resume()
def test_success_reader(self):
    # Assume
    html = '<HTML><HEAD><TITLE>Ejemplo 2</TITLE></HEAD><BODY></BODY></HTML>'
    # Action
    reader = Reader()
    reader.feed(html)
    metrics = reader.get_metrics()
    # Assert
    self.assertEqual(metrics['total_elements'], 4)

    # Assume
    html2 = '<HTML><HEAD><IMG /><TITLE>Ejemplo 2</TITLE></HEAD><BODY></BODY></HTML>'
    # Action
    reader2 = Reader()
    reader2.feed(html2)
    metrics2 = reader2.get_metrics()
    # Assert
    self.assertEqual(metrics2['total_elements'], 5)

    # Assume
    html3 = '<HTML><HEAD><BADTAG><TITLE>Ejemplo 2</TITLE></HEAD><BADTAG><BODY></BODY></HTML>'
    # Action
    reader3 = Reader()
    reader3.feed(html3)
    metrics3 = reader3.get_metrics()
    # Assert
    self.assertEqual(metrics3['total_elements'], 4)
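# A plausible sketch of the Reader exercised by test_success_reader and
# test_invalid_feed above: an HTMLParser subclass that counts only recognised
# elements. The tag whitelist and the explicit TypeError are assumptions made
# to match the assertions, not necessarily the project's actual implementation.
from html.parser import HTMLParser

class Reader(HTMLParser):
    KNOWN_TAGS = {'html', 'head', 'title', 'body', 'img'}

    def __init__(self):
        super().__init__()
        self.total_elements = 0

    def feed(self, data):
        if not isinstance(data, str):
            raise TypeError("feed() expects a string of HTML")
        super().feed(data)

    def handle_starttag(self, tag, attrs):
        # HTMLParser lowercases tag names; unknown tags (e.g. BADTAG) are ignored.
        if tag in self.KNOWN_TAGS:
            self.total_elements += 1

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags such as <IMG /> count as one element.
        self.handle_starttag(tag, attrs)

    def get_metrics(self):
        return {'total_elements': self.total_elements}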
class Console(cmd.Cmd):

    def __init__(self):
        cmd.Cmd.__init__(self)
        self.prompt = "wayterm > "
        self.reader = Reader()

    def do_hist(self, args):
        """Print a list of commands that have been entered"""
        print self._hist

    def do_exit(self, args):
        """Exits from the console"""
        return -1

    def do_EOF(self, args):
        """Exit on system end of file character"""
        return self.do_exit(args)

    def do_shell(self, args):
        """Pass command to a system shell when line begins with '!'"""
        os.system(args)

    def do_help(self, args):
        """Get help on commands

        'help' or '?' with no arguments prints a list of commands for which
        help is available.
        'help <command>' or '? <command>' gives help on <command>.
        """
        ## The only reason to define this method is for the help text in the doc string
        self.reader.printfile('help')

    def preloop(self):
        """Initialization before prompting user for commands.

        Despite the claims in the Cmd documentation, Cmd.preloop() is not a stub.
        """
        cmd.Cmd.preloop(self)  ## sets up command completion
        self._hist = []        ## No history yet
        self._locals = {}      ## Initialize execution namespace for user
        self._globals = {}

    def postloop(self):
        """Take care of any unfinished business.

        Despite the claims in the Cmd documentation, Cmd.postloop() is not a stub.
        """
        cmd.Cmd.postloop(self)  ## Clean up command completion
        print "Exiting..."

    def precmd(self, line):
        """This method is called after the line has been input but before it
        has been interpreted. If you want to modify the input line before
        execution (for example, variable substitution) do it here.
        """
        self._hist += [line.strip()]
        return line

    def postcmd(self, stop, line):
        """If you want to stop the console, return something that evaluates
        to true. If you want to do some post command processing, do it here.
        """
        return stop

    def emptyline(self):
        """Do nothing on empty input line"""
        pass

    def default(self, line):
        """Called on an input line when the command prefix is not recognized.

        In that case we execute the line as Python code.
        """
        try:
            wayterm.call(line.split('\\'))
        except Exception, e:
            print e.__class__, ":", e
def parse_fun(json_text):
    return parse(Reader(json_text))
#!/usr/bin/python3
from src.reader import Reader
import _thread
import argparse
from generate_logs import generate_logs
from src.constants import DEFAULT_OUTPUT_FILE
import sys

sys.path.append("./src")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(__file__, description="Log Generator")
    parser.add_argument("--generate", "-g", dest="file_generate",
                        help="Output file path", type=str)
    parser.add_argument("--file", "-f", dest="file_read",
                        help="read file path", type=str)
    parser.add_argument("--threshold", "-t", dest="threshold",
                        help="alerting thresholds", type=str)
    args = parser.parse_args()

    file_read = DEFAULT_OUTPUT_FILE if not args.file_read else args.file_read
    file_generate = args.file_generate
    threshold = args.threshold

    r = Reader()
    try:
        _thread.start_new_thread(generate_logs, (file_generate,))
        _thread.start_new_thread(r.read_lines, (file_read,))
    except:
        print("Error: unable to start thread")
    while 1:
        pass