def _load_kw_trees(self) -> List[KeywordTree]:
    """Build one prefix tree per term dictionary file.

    Every dictionary file is named after the number of tokens in the terms
    it contains. The files are processed in reverse order of the list below,
    i.e. starting with the file holding the longest terms.

    :return: list of finalized prefix trees, one per dictionary file
    """
    dictionary_names = [
        '1.txt', '2.txt', '3.txt', '4.txt', '5.txt',
        '6.txt', '7.txt', '8.txt', '9.txt', '10.txt',
        '11.txt', '12.txt', '13.txt', '14.txt', '20.txt',
    ]
    terms_dir = os.path.join(DICT_EXTRACTOR_PATH, TERMS_DIR_NAME)

    trees = []
    for dictionary_name in reversed(dictionary_names):
        with open(os.path.join(terms_dir, dictionary_name), 'r') as dictionary:
            terms = dictionary.read().split('\n')

        tree = KeywordTree()
        for term in terms:
            if term != '':
                # a term is stored as its sequence of whitespace-separated tokens
                tree.add(term.split())
        tree.finalize()
        trees.append(tree)
    return trees
class Monitor(threading.Thread):
    """Continuously scan for BLE advertisements.

    The thread opens the bluetooth device, enables LE scanning and passes every
    advertisement that survives the configured filters to ``callback``.
    """

    def __init__(self, callback, bt_device_id, device_filter, packet_filter, scan_parameters):
        """Construct interface object.

        Args:
            callback: called as ``callback(bt_addr, rssi, packet, properties)``
                for every advertisement that passes the filters
            bt_device_id: number of the bt device (hciX)
            device_filter: list of beacon filters to monitor, or None
            packet_filter: list of packet types to monitor, or None
            scan_parameters: dict of keyword arguments handed to
                :meth:`set_scan_parameters` when the thread starts
        """
        # do import here so that the package can be used in parsing-only mode (no bluez required)
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        self.keep_going = True
        self.callback = callback

        # number of the bt device (hciX)
        self.bt_device_id = bt_device_id
        # list of beacons to monitor
        self.device_filter = device_filter
        self.mode = get_mode(device_filter)
        # list of packet types to monitor
        self.packet_filter = packet_filter
        # bluetooth socket
        self.socket = None
        # keep track of Eddystone Beacon <-> bt addr mapping
        self.eddystone_mappings = []
        # parameters to pass to bt device
        self.scan_parameters = scan_parameters

        # construct an aho-corasick search tree for efficient prefiltering
        service_uuid_prefix = b"\x03\x03"
        self.kwtree = KeywordTree()
        if self.mode & ScannerMode.MODE_IBEACON:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE])
                + IBEACON_MANUFACTURER_ID + IBEACON_PROXIMITY_TYPE)
        if self.mode & ScannerMode.MODE_EDDYSTONE:
            self.kwtree.add(service_uuid_prefix + EDDYSTONE_UUID)
        if self.mode & ScannerMode.MODE_ESTIMOTE:
            self.kwtree.add(service_uuid_prefix + ESTIMOTE_UUID)
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + ESTIMOTE_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_CJMONITOR:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + CJ_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            self.kwtree.add(service_uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        self.kwtree.finalize()

    def run(self):
        """Continuously scan for BLE advertisements until ``keep_going`` is cleared."""
        self.socket = self.backend.open_dev(self.bt_device_id)
        try:
            self.set_scan_parameters(**self.scan_parameters)
            self.toggle_scan(True)

            while self.keep_going:
                pkt = self.socket.recv(255)
                event = to_int(pkt[1])
                subevent = to_int(pkt[3])
                if event == LE_META_EVENT and subevent == EVT_LE_ADVERTISING_REPORT:
                    # we have a BLE advertisement
                    self.process_packet(pkt)
        finally:
            # release the socket even when setup, parsing or the callback raises
            self.socket.close()

    def set_scan_parameters(self, scan_type=ScanType.ACTIVE, interval_ms=10, window_ms=10,
                            address_type=BluetoothAddressType.RANDOM,
                            filter_type=ScanFilter.ALL):
        """Set the LE scan parameters.

        Args:
            scan_type: ScanType.(PASSIVE|ACTIVE)
            interval_ms: ms (as float) between scans (valid range 2.5ms - 10240ms)
                ..note:: when interval and window are equal, the scan
                    runs continuously
            window_ms: ms (as float) scan duration (valid range 2.5ms - 10240ms)
            address_type: Bluetooth address type BluetoothAddressType.(PUBLIC|RANDOM)
                * PUBLIC = use device MAC address
                * RANDOM = generate a random MAC address and use that
            filter_type: ScanFilter.(ALL|WHITELIST_ONLY) only ALL is supported, which will
                return all fetched bluetooth packets (WHITELIST_ONLY is not supported,
                because OCF_LE_ADD_DEVICE_TO_WHITE_LIST command is not implemented)

        Raises:
            ValueError: A value had an unexpected format or was not in range
        """
        # the command carries both durations as multiples of MS_FRACTION_DIVIDER ms
        interval_fractions = interval_ms / MS_FRACTION_DIVIDER
        if interval_fractions < 0x0004 or interval_fractions > 0x4000:
            # report the value in the unit the caller supplied (ms), not in fractions
            raise ValueError(
                "Invalid interval given {}, must be in range of 2.5ms to 10240ms!"
                .format(interval_ms))
        window_fractions = window_ms / MS_FRACTION_DIVIDER
        if window_fractions < 0x0004 or window_fractions > 0x4000:
            raise ValueError(
                "Invalid window given {}, must be in range of 2.5ms to 10240ms!"
                .format(window_ms))

        interval_fractions, window_fractions = int(interval_fractions), int(window_fractions)

        scan_parameter_pkg = struct.pack(
            "<BHHBB",
            scan_type,
            interval_fractions,
            window_fractions,
            address_type,
            filter_type)
        self.backend.send_cmd(self.socket, OGF_LE_CTL, OCF_LE_SET_SCAN_PARAMETERS,
                              scan_parameter_pkg)

    def toggle_scan(self, enable, filter_duplicates=False):
        """Enable or disable BLE scanning.

        Args:
            enable: boolean value to enable (True) or disable (False) scanner
            filter_duplicates: boolean value to enable/disable filter, that
                omits duplicated packets
        """
        command = struct.pack("BB", enable, filter_duplicates)
        self.backend.send_cmd(self.socket, OGF_LE_CTL, OCF_LE_SET_SCAN_ENABLE, command)

    def process_packet(self, pkt):
        """Parse the packet and call callback if one of the filters matches."""
        payload = pkt[14:-1]

        # check if this could be a valid packet before parsing
        # this reduces the CPU load significantly
        if not self.kwtree.search(payload):
            return

        bt_addr = bt_addr_to_string(pkt[7:13])
        rssi = bin_to_int(pkt[-1])
        # strip bluetooth address and parse packet
        packet = parse_packet(payload)

        # return if packet was not a beacon advertisement
        if not packet:
            return

        # we need to remember which eddystone beacon has which bt address
        # because the TLM and URL frames do not contain the namespace and instance
        self.save_bt_addr(packet, bt_addr)
        # properties holds the identifying information for a beacon
        # e.g. instance and namespace for eddystone; uuid, major, minor for iBeacon
        properties = self.get_properties(packet, bt_addr)

        if self.device_filter is None and self.packet_filter is None:
            # no filters selected
            self.callback(bt_addr, rssi, packet, properties)

        elif self.device_filter is None:
            # filter by packet type
            if is_one_of(packet, self.packet_filter):
                self.callback(bt_addr, rssi, packet, properties)
        else:
            # filter by device and packet type
            if self.packet_filter and not is_one_of(packet, self.packet_filter):
                # return if packet filter does not match
                return

            # iterate over filters and call .matches() on each;
            # the callback fires at most once per packet
            for filtr in self.device_filter:
                if isinstance(filtr, BtAddrFilter):
                    if filtr.matches({'bt_addr': bt_addr}):
                        self.callback(bt_addr, rssi, packet, properties)
                        return

                elif filtr.matches(properties):
                    self.callback(bt_addr, rssi, packet, properties)
                    return

    def save_bt_addr(self, packet, bt_addr):
        """Add to the list of mappings (only Eddystone UID frames are recorded)."""
        if isinstance(packet, EddystoneUIDFrame):
            # drop any old mapping for this address before adding the new one
            new_mappings = [m for m in self.eddystone_mappings if m[0] != bt_addr]
            new_mappings.append((bt_addr, packet.properties))
            self.eddystone_mappings = new_mappings

    def get_properties(self, packet, bt_addr):
        """Get properties of beacon depending on type."""
        if is_one_of(packet, [EddystoneTLMFrame, EddystoneURLFrame,
                              EddystoneEncryptedTLMFrame, EddystoneEIDFrame]):
            # here we retrieve the namespace and instance which corresponds to the
            # eddystone beacon with this bt address
            return self.properties_from_mapping(bt_addr)
        else:
            return packet.properties

    def properties_from_mapping(self, bt_addr):
        """Retrieve properties (namespace, instance) for the specified bt address.

        Returns None when no mapping has been recorded for ``bt_addr``.
        """
        for addr, properties in self.eddystone_mappings:
            if addr == bt_addr:
                return properties
        return None

    def terminate(self):
        """Signal runner to stop and join thread."""
        self.toggle_scan(False)
        self.keep_going = False
        self.join()
# arr = [95, 1, 5, 67, 223, 566, 33, 67, 13, 1, 5, 67]
# 0 = bear
# 1 or 2 = bull
day_selector = 1
c_open = float(data[day_selector][2])
c_high = float(data[day_selector][3])
c_low = float(data[day_selector][4])
c_close = float(data[day_selector][5])
# print("Open Ref: " + str(c_open) + "\n")
print(build_candle_id(c_close, c_open, c_high, c_low))

# Index the candle id of every row so that ids can be looked up by substring.
kwtree = KeywordTree(case_insensitive=True)
for row in data:
    c_open = float(row[2])
    c_high = float(row[3])
    c_low = float(row[4])
    c_close = float(row[5])
    kwtree.add(build_candle_id(c_close, c_open, c_high, c_low))
kwtree.finalize()

# search() returns a single (keyword, index) tuple or None, so looping over it
# would walk the tuple's two fields (or fail on None); search_all() yields one
# (keyword, index) tuple per match, which is what the counting loop needs.
results = kwtree.search_all('S-22.77-51.3-25.93-72')
res_count = 0
for result in results:
    res_count += 1
    print(result)
import random
import pathlib
from termcolor import colored

U = "CTAGTTAG"
V = "bvbccvCTAnGTTAGvfqvsdqvqCTAGTTAcGvfdCTACGATAGvvfGTTGTTfdvCTAtggAGsfsdfdCTAdddddddddddAGvbcvbcvb"
print("\n", "Text : ", V)
print("Motif :", U)
erreur = input("erreur : ")

# Cut the motif into consecutive pieces whose length is the number entered above
Pi = []
for start in range(0, len(U), int(erreur)):
    Pi.append(U[start:start + int(erreur)])
print(Pi)

# Aho-Corasick search: index every piece of the motif
kwtree = KeywordTree(case_insensitive=True)
for piece in Pi:
    kwtree.add(piece)
kwtree.finalize()

results = kwtree.search_all(V)

# Display all occurrences as a {position: piece} mapping
Vals = []
Keyz = []
for keyword, position in results:
    Vals.append(keyword)
    Keyz.append(position)

dictionary = dict(zip(Keyz, Vals))
print(dictionary, "\n")
import numpy as np import pandas as pd import csv from ahocorapy.keywordtree import KeywordTree # Initializing Brand Names brandNamesFile = open('brandNames.csv', 'r') reader = csv.reader(brandNamesFile) allRows = [row for row in reader] brandNames = [] for item in allRows: brandNames.append(item[0]) # Initializing ahocorasick search kwtree = KeywordTree(case_insensitive=True) for brand in brandNames: kwtree.add(brand) kwtree.finalize() # Initializing praw reddit client client_id = input('client_id: ') client_secret = input('client_secret: ') username = input('username: '******'password: '******'WAYWT Stats Scraper',
def test_visualizer(self):
    """Draw the README example tree into ``readme_example.png``."""
    # Needs working pygraphviz on system
    tree = KeywordTree(case_insensitive=True)
    for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella', 'orca'):
        tree.add(keyword)
    tree.finalize()

    Visualizer().draw('readme_example.png', tree)
class Monitor(threading.Thread):
    """Continuously scan for BLE advertisements.

    The thread opens the bluetooth device, reads the controller's HCI version,
    enables LE scanning (legacy or extended commands, depending on that
    version) and passes every advertisement that survives the configured
    filters to ``callback``.
    """

    def __init__(self, callback, bt_device_id, device_filter, packet_filter, scan_parameters):
        """Construct interface object.

        Args:
            callback: called as ``callback(bt_addr, rssi, packet, properties)``
                for every advertisement that passes the filters
            bt_device_id: number of the bt device (hciX)
            device_filter: list of beacon filters to monitor, or None
            packet_filter: list of packet types to monitor, or None
            scan_parameters: dict of keyword arguments handed to
                :meth:`set_scan_parameters` when the thread starts
        """
        # do import here so that the package can be used in parsing-only mode (no bluez required)
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        self.keep_going = True
        self.callback = callback

        # number of the bt device (hciX)
        self.bt_device_id = bt_device_id
        # list of beacons to monitor
        self.device_filter = device_filter
        self.mode = get_mode(device_filter)
        # list of packet types to monitor
        self.packet_filter = packet_filter
        # bluetooth socket
        self.socket = None
        # keep track of Eddystone Beacon <-> bt addr mapping
        self.eddystone_mappings = []
        # parameters to pass to bt device
        self.scan_parameters = scan_parameters
        # hci version; replaced by the controller's answer once run() starts
        self.hci_version = HCIVersion.BT_CORE_SPEC_1_0

        # construct an aho-corasick search tree for efficient prefiltering
        service_uuid_prefix = b"\x03\x03"
        self.kwtree = KeywordTree()
        if self.mode & ScannerMode.MODE_IBEACON:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE])
                + IBEACON_MANUFACTURER_ID + IBEACON_PROXIMITY_TYPE)
        if self.mode & ScannerMode.MODE_EDDYSTONE:
            self.kwtree.add(service_uuid_prefix + EDDYSTONE_UUID)
        if self.mode & ScannerMode.MODE_ESTIMOTE:
            self.kwtree.add(service_uuid_prefix + ESTIMOTE_UUID)
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + ESTIMOTE_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_CJMONITOR:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + CJ_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            self.kwtree.add(service_uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        self.kwtree.finalize()

    def run(self):
        """Continuously scan for BLE advertisements until ``keep_going`` is cleared."""
        self.socket = self.backend.open_dev(self.bt_device_id)
        try:
            self.hci_version = self.get_hci_version()
            self.set_scan_parameters(**self.scan_parameters)
            self.toggle_scan(True)

            while self.keep_going:
                pkt = self.socket.recv(255)
                event = to_int(pkt[1])
                subevent = to_int(pkt[3])
                if event == LE_META_EVENT and subevent in [EVT_LE_ADVERTISING_REPORT,
                                                           EVT_LE_EXT_ADVERTISING_REPORT]:
                    # we have a BLE advertisement
                    self.process_packet(pkt)
        finally:
            # release the socket even when setup, parsing or the callback raises
            self.socket.close()

    def get_hci_version(self):
        """Get the HCI version reported by the controller.

        Falls back to ``HCIVersion.BT_CORE_SPEC_1_0`` when the response cannot
        be parsed or the backend does not implement the request.
        """
        local_version = Struct(
            "status" / Byte,
            "hci_version" / Byte,
            "hci_revision" / Bytes(2),
            "lmp_version" / Byte,
            "manufacturer_name" / Bytes(2),
            "lmp_subversion" / Bytes(2),
        )

        try:
            resp = self.backend.send_req(self.socket, OGF_INFO_PARAM, OCF_READ_LOCAL_VERSION,
                                         EVT_CMD_COMPLETE, local_version.sizeof(), bytes(), 0)
            return HCIVersion(GreedyRange(local_version).parse(resp)[0]["hci_version"])
        except (ConstructError, NotImplementedError):
            return HCIVersion.BT_CORE_SPEC_1_0

    def set_scan_parameters(self, scan_type=ScanType.ACTIVE, interval_ms=10, window_ms=10,
                            address_type=BluetoothAddressType.RANDOM,
                            filter_type=ScanFilter.ALL):
        """Set the LE scan parameters.

        For extended set scan parameters command additional parameter scanning PHYs has to be
        provided. The parameter indicates the PHY(s) on which the advertising packets should be
        received on the primary advertising physical channel. For further information have a
        look on BT Core 5.1 Specification, page 1439 (LE Set Extended Scan Parameters command).

        Args:
            scan_type: ScanType.(PASSIVE|ACTIVE)
            interval_ms: ms (as float) between scans (valid range 2.5ms - 10240ms or 40.95s
                for extended version)
                ..note:: when interval and window are equal, the scan
                    runs continuously
            window_ms: ms (as float) scan duration (valid range 2.5ms - 10240ms or 40.95s
                for extended version)
            address_type: Bluetooth address type BluetoothAddressType.(PUBLIC|RANDOM)
                * PUBLIC = use device MAC address
                * RANDOM = generate a random MAC address and use that
            filter_type: ScanFilter.(ALL|WHITELIST_ONLY) only ALL is supported, which will
                return all fetched bluetooth packets (WHITELIST_ONLY is not supported,
                because OCF_LE_ADD_DEVICE_TO_WHITE_LIST command is not implemented)

        Raises:
            ValueError: A value had an unexpected format or was not in range
        """
        max_interval = (0x4000 if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else 0xFFFF)
        # the command carries both durations as multiples of MS_FRACTION_DIVIDER ms
        interval_fractions = interval_ms / MS_FRACTION_DIVIDER
        if interval_fractions < 0x0004 or interval_fractions > max_interval:
            # report the value in the unit the caller supplied (ms), not in fractions
            raise ValueError(
                "Invalid interval given {}, must be in range of 2.5ms to {}ms!".format(
                    interval_ms, max_interval * MS_FRACTION_DIVIDER))
        window_fractions = window_ms / MS_FRACTION_DIVIDER
        if window_fractions < 0x0004 or window_fractions > max_interval:
            raise ValueError(
                "Invalid window given {}, must be in range of 2.5ms to {}ms!".format(
                    window_ms, max_interval * MS_FRACTION_DIVIDER))

        interval_fractions, window_fractions = int(interval_fractions), int(window_fractions)

        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            command_field = OCF_LE_SET_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BHHBB",
                scan_type,
                interval_fractions,
                window_fractions,
                address_type,
                filter_type)
        else:
            command_field = OCF_LE_SET_EXT_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BBBBHH",
                address_type,
                filter_type,
                1,  # scan advertisements on the LE 1M PHY
                scan_type,
                interval_fractions,
                window_fractions)

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, scan_parameter_pkg)

    def toggle_scan(self, enable, filter_duplicates=False):
        """Enable or disable BLE scanning.

        For extended set scan enable command additional parameters duration and period have
        to be provided. When both are zero, the controller shall continue scanning until
        scanning is disabled. For non-zero values have a look on BT Core 5.1 Specification,
        page 1442 (LE Set Extended Scan Enable command).

        Args:
            enable: boolean value to enable (True) or disable (False) scanner
            filter_duplicates: boolean value to enable/disable filter, that
                omits duplicated packets
        """
        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            command_field = OCF_LE_SET_SCAN_ENABLE
            command = struct.pack("BB", enable, filter_duplicates)
        else:
            command_field = OCF_LE_SET_EXT_SCAN_ENABLE
            command = struct.pack(
                "<BBHH",
                enable,
                filter_duplicates,
                0,  # duration
                0   # period
            )

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, command)

    def process_packet(self, pkt):
        """Parse the packet and call callback if one of the filters matches."""
        # legacy and extended advertising reports place payload and rssi at different offsets
        payload = pkt[14:-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[29:]

        # check if this could be a valid packet before parsing
        # this reduces the CPU load significantly
        if not self.kwtree.search(payload):
            return

        bt_addr = bt_addr_to_string(pkt[7:13])
        rssi = bin_to_int(pkt[-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[18])
        # strip bluetooth address and parse packet
        packet = parse_packet(payload)

        # return if packet was not a beacon advertisement
        if not packet:
            return

        # we need to remember which eddystone beacon has which bt address
        # because the TLM and URL frames do not contain the namespace and instance
        self.save_bt_addr(packet, bt_addr)
        # properties holds the identifying information for a beacon
        # e.g. instance and namespace for eddystone; uuid, major, minor for iBeacon
        properties = self.get_properties(packet, bt_addr)

        if self.device_filter is None and self.packet_filter is None:
            # no filters selected
            self.callback(bt_addr, rssi, packet, properties)

        elif self.device_filter is None:
            # filter by packet type
            if is_one_of(packet, self.packet_filter):
                self.callback(bt_addr, rssi, packet, properties)
        else:
            # filter by device and packet type
            if self.packet_filter and not is_one_of(packet, self.packet_filter):
                # return if packet filter does not match
                return

            # iterate over filters and call .matches() on each;
            # the callback fires at most once per packet
            for filtr in self.device_filter:
                if isinstance(filtr, BtAddrFilter):
                    if filtr.matches({'bt_addr': bt_addr}):
                        self.callback(bt_addr, rssi, packet, properties)
                        return

                elif filtr.matches(properties):
                    self.callback(bt_addr, rssi, packet, properties)
                    return

    def save_bt_addr(self, packet, bt_addr):
        """Add to the list of mappings (only Eddystone UID frames are recorded)."""
        if isinstance(packet, EddystoneUIDFrame):
            # drop any old mapping for this address before adding the new one
            new_mappings = [m for m in self.eddystone_mappings if m[0] != bt_addr]
            new_mappings.append((bt_addr, packet.properties))
            self.eddystone_mappings = new_mappings

    def get_properties(self, packet, bt_addr):
        """Get properties of beacon depending on type."""
        if is_one_of(packet, [EddystoneTLMFrame, EddystoneURLFrame,
                              EddystoneEncryptedTLMFrame, EddystoneEIDFrame]):
            # here we retrieve the namespace and instance which corresponds to the
            # eddystone beacon with this bt address
            return self.properties_from_mapping(bt_addr)
        else:
            return packet.properties

    def properties_from_mapping(self, bt_addr):
        """Retrieve properties (namespace, instance) for the specified bt address.

        Returns None when no mapping has been recorded for ``bt_addr``.
        """
        for addr, properties in self.eddystone_mappings:
            if addr == bt_addr:
                return properties
        return None

    def terminate(self):
        """Signal runner to stop and join thread."""
        self.toggle_scan(False)
        self.keep_going = False
        self.join()
def load_words(): words = set() with open(os.path.join(basedir, '../data/dict/words.txt')) as input: for line in input: words.add(line.strip()) return words keywords.update(load_dict()) keywords.update(load_words()) keyword_pattern = re.compile('(' + '|'.join(keywords) + ')', re.IGNORECASE) keyword_tree = KeywordTree(case_insensitive=True) for word in keywords: keyword_tree.add(word) keyword_tree.finalize() def nsfw_text(text: str): if keyword_tree.search(text): return True return False anti_filter_pattern = r'(?<= )(\w)(?= )' def anti_fiter(text: str):
def test_finalize_errors(self):
    """Expect ``ValueError`` for each kind of misuse around ``finalize()``."""
    # searching a tree that has not been finalized yet
    unfinalized = KeywordTree(case_insensitive=True)
    unfinalized.add('bla')
    unfinalized.add('blue')
    with self.assertRaises(ValueError):
        unfinalized.search('blueb')

    # adding a keyword after the tree has been finalized
    sealed = KeywordTree(case_insensitive=True)
    sealed.add('bla')
    sealed.finalize()
    with self.assertRaises(ValueError):
        sealed.add('blueb')

    # finalizing the same tree a second time
    twice = KeywordTree(case_insensitive=True)
    twice.add('bla')
    twice.finalize()
    with self.assertRaises(ValueError):
        twice.finalize()
def test_case_insensitivity_mode(self):
    """A case-insensitive tree matches any casing and reports the keyword as it was added."""
    tree = KeywordTree(case_insensitive=True)
    for keyword in ('bla', 'blue', 'blISs'):
        tree.add(keyword)
    tree.finalize()

    self.assertEqual(('bla', 0), tree.search('bLa'))
    self.assertEqual(('blISs', 0), tree.search('BLISS'))
def test_case_sensitivity(self):
    """A default tree is case sensitive: only the exact spelling is found."""
    tree = KeywordTree()
    for keyword in ('bla', 'blue', 'blISs'):
        tree.add(keyword)
    tree.finalize()

    # none of these differ from a keyword in anything but letter case
    for wrong_case in ('bLa', 'BLISS', 'bliss'):
        self.assertIsNone(tree.search(wrong_case))

    self.assertEqual(('blISs', 0), tree.search('blISs'))
def test_unicode(self):
    """Non-ASCII keywords are found at their character index."""
    tree = KeywordTree()
    for keyword in ('bla', 'blue', u'颜到'):
        tree.add(keyword)
    tree.finalize()

    found = tree.search(u'春华变苍颜到处群魔乱')
    self.assertEqual((u'颜到', 4), found)

    missing = tree.search(u'三年过')
    self.assertIsNone(missing)
def test_domains(self):
    """A domain keyword is found inside an e-mail address, right after the ``@``."""
    kwtree = KeywordTree()
    kwtree.add('searchenginemarketingfordummies.com')
    kwtree.add('linkpt.com')
    kwtree.add('fnbpeterstown.com')
    kwtree.finalize()

    # The searched text must actually contain 'linkpt.com' starting at index 10
    # (ten characters up to and including the '@'); a masked placeholder such as
    # '*****@*****.**' contains no keyword and can never satisfy the assertion.
    result = kwtree.search('peterchen@linkpt.com')
    self.assertEqual(('linkpt.com', 10), result)
def test_empty_tree(self):
    """Searching a finalized tree that holds no keywords finds nothing."""
    empty_tree = KeywordTree()
    empty_tree.finalize()

    self.assertIsNone(empty_tree.search('zef'))
def ahocorasick_any_match(text_info):
    """Return ``text_info`` if every non-empty condition is matched, else ``False``.

    Layout read from ``text_info[1]``:
        [2] name condition (string)
        [3] lang condition (string)
        [4] type condition (string)
        [5] tags condition (list of strings)
        [6] categories condition (list of strings)
        [7] mapping with the candidate's "name", "lang", "type", "tags" and
            "categories" values that the conditions are checked against

    An empty condition places no constraint on its field and counts as
    satisfied.

    NOTE: all condition keywords share one tree, so a field is considered
    matched when it contains a keyword from *any* condition, not only from
    its own one.
    """
    conditions = text_info[1]
    kwtree_any = KeywordTree(case_insensitive=True)

    bool_name = False
    bool_lang = False
    bool_type = False
    bool_tags = False
    bool_categories = False

    # --- create tree -------------------------------------------------------
    # name condition (string)
    if not conditions[2]:
        bool_name = True
    else:
        kwtree_any.add(conditions[2])

    # lang condition (string)
    if not conditions[3]:
        bool_lang = True
    else:
        kwtree_any.add(conditions[3])

    # type condition (string)
    if not conditions[4]:
        bool_type = True
    else:
        kwtree_any.add(conditions[4])

    # tags condition (list); an empty list satisfies the *tags* flag
    if not conditions[5]:
        bool_tags = True
    else:
        for tag in conditions[5]:
            kwtree_any.add(tag)

    # categories condition (list); an empty list satisfies the *categories* flag
    if not conditions[6]:
        bool_categories = True
    else:
        for categ in conditions[6]:
            kwtree_any.add(categ)

    kwtree_any.finalize()

    # --- any match ---------------------------------------------------------
    candidate = conditions[7]

    if kwtree_any.search_one(candidate["name"]):
        bool_name = True

    if kwtree_any.search_one(candidate["lang"]):
        bool_lang = True

    if kwtree_any.search_one(candidate["type"]):
        bool_type = True

    tags = helper_list_to_str(candidate["tags"])
    if kwtree_any.search_one(tags):
        bool_tags = True

    categs = helper_list_to_str(candidate["categories"])
    if kwtree_any.search_one(categs):
        bool_categories = True

    # --- result ------------------------------------------------------------
    if bool_name and bool_lang and bool_type and bool_tags and bool_categories:
        return text_info
    return False
def __init__(self, filename, minus_words):
    """Store ``filename`` and index ``minus_words`` in a case-insensitive keyword tree."""
    self.filename = filename

    self.tree = index = KeywordTree(case_insensitive=True)
    for minus_word in minus_words:
        index.add(minus_word)
    index.finalize()
'Trump has no clue how to negotiate trade with China or any other country. He has no idea how tariffs really work. He refuses to see how his ridiculous trade war will hurt average Americans.', 'China has 51 female billionaires, South Korea has 1, Japan has none. And the answer might well be down to something Chairman Mao unintentionally set in motion', 'President Trump today called his trade war with China a “little squabble.” It’s not a squabble, it’s a disaster with huge costs for California consumers, farmers and businesses. It’s time to stop holding American consumers and producers hostage as trade talks continue.', 'All that President Trump has to do to win the Great Trade War with China is to give American farmers some tariff money to compensate them. Meanwhile, he can sit back & watch the tech industry get hammered - which they richly deserve. Thats called a win-win.', 'The Trump admin has made $8,520,000,000 in direct payments to farmers through a 2018 aid program designed to counter losses stemming from Trumps trade war with China, Axios reports. ', 'Farmer who voted for Trump says he’ll "never vote for him again" as family set to lose $150,000 in China trade war ', 'Republicans borrowing another 20 billion dollars from China to pay farmers who cannot sell their products to China because of the trade war shows their loyalty to free market principals wait a second I don’t get it.', 'Trump lost 5 points on Rasmussen in the past 3 days. Im guessing delayed reaction to people not understanding the trade war with China and the one day stock selloff.', 'BREAKING NEWS: Chinese people rush to buy $3 Donald Trump toilet brushes amid trade war with the U.S. & joke Trump can be so useful. People in China are cheering for Beijing in a trade war with Washington by cleaning their toilets with brushes that look like the U.S. President. ', 'Most Chinese agree that the US is more powerful than China and Washington holds initiative in the trade war. 
But we just dont want to cave in and we believe there is no way the US can crush China. We are willing to bear some pain to give the US a lesson.', 'China will fight to the end if President Trumps trade war continues says Beijings ambassador to the UK as he calls Americans troublemakers who are fixated on us first', 'My discussion with soybean farmer and Wisconsin Soybean Association president Tony Mellenthin @MellenthinFarms about how the trade war with China is currently affecting farmers, and what happens once it is over. ' ] # AHO - CORASICK # kwtree = KeywordTree(case_insensitive=True) df = pd.DataFrame() matches = [] tmp = [] clean_keywords = [] # Add keywords to the trie for word in keywords: kwtree.add(word[0]) clean_keywords.append(word[0]) kwtree.finalize() # Run a search on every tweet and add to dataframe for i in range(len(tweets)): matches = []
def test_readme_example(self):
    '''
    As used in the projects README. If you have to change this test case,
    please update the README accordingly.
    '''
    tree = KeywordTree(case_insensitive=True)
    for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella', 'orca'):
        tree.add(keyword)
    tree.finalize()

    # single-match searches
    self.assertEqual(
        ('malaga', 24),
        tree.search('My favorite islands are malaga and sylt.'))
    self.assertEqual(
        ('lacrosse', 29),
        tree.search('idontlikewhitespaceswhereismalacrossequestionmark'))

    # search_all yields every match, in exactly this order, and then stops
    matches = tree.search_all('malheur on mallorca bellacrosse')
    self.assertIsNotNone(matches)
    expected_in_order = [
        ('mallorca', 11),
        ('orca', 15),
        ('mallorca bella', 11),
        ('lacrosse', 23),
    ]
    for expected in expected_in_order:
        self.assertEqual(expected, next(matches))
    self.assertRaises(StopIteration, next, matches)
class OWMCitySlot:
    """Find city names in user utterances with a keyword trie."""

    def __init__(
        self, path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Initialize a trie for finding city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list of cities
            file format: ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", "Taglag", ..... , "Gerton"]
            this list was created using the source file:
            https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)

        self.kwtree = KeywordTree(case_insensitive=True)
        for city_name in self.geonames:
            # the surrounding spaces make the trie match whole words only
            self.kwtree.add(f" {city_name} ")
        self.kwtree.finalize()

    def _load_from_json(self, path_to_geo_entities: str) -> List[str]:
        """Load the distinct city names stored in a JSON file.

        :param path_to_geo_entities: filepath to a JSON file
        :type path_to_geo_entities: str
        :return: a list containing city names, without duplicates
        :rtype: List[str]
        """
        with open(path_to_geo_entities, "r", encoding="utf-8") as json_file:
            cities = json.load(json_file)
        return list(set(cities))

    def find_geo_names_in_utterance(self, utterance: str) -> str:
        """Search the first occurrence of the location name in utterance.

        :param utterance: human utterance
        :type utterance: str
        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        # turn every punctuation character into a space
        punctuation_to_space = str.maketrans(
            string.punctuation, " " * len(string.punctuation)
        )
        cleaned = utterance.translate(punctuation_to_space)
        # collapse runs of whitespace and trim the ends
        cleaned = re.sub(r"\s{2,}", " ", cleaned.lower()).strip()

        # pad with spaces so that the first and the last word can match as well
        found = list(self.kwtree.search_all(f" {cleaned} "))
        # TODO the method could be improved if we search all geo names and then filter
        # the most precise geo entity.
        # User may write: "Massachusetts Boston" -> It has 2 entities, and Boston is preferred
        # because it is a more precise location.
        return self.get_best_match(found)

    def get_best_match(self, results: Iterable[Tuple[str, int]]) -> str:
        """Pick, among the matches with the lowest index, the longest one.

        Usually the earliest entity is the most precise. For example for the utterance:
        "west valley city utah", we receive:

        [(' West ', 0), (' West Valley ', 0), (' Valley ', 5), (' West Valley City ', 0),
        (' Valley City ', 5), (' Utah ', 17)], we should select "West Valley City".

        :param results: a sequence with the following pairs (<location_name>, <index>)
        :type results: Iterable[Sequence[str, int]]
        :return: the best match or an empty string if the results are empty
        :rtype: str
        """
        if not results:
            return ""

        def rank(entity):
            name, index = entity
            # lowest index first; for equal indexes the longest name first
            return index, -len(name.strip())

        ranked = sorted(results, key=rank)
        return ranked[0][0].strip()

    def __call__(self, *args, **kwargs) -> str:
        """Find the best match in the trie.

        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        return self.find_geo_names_in_utterance(*args, **kwargs)
def test_suffix_stuff(self):
    """A keyword is still found when the text first follows a longer prefix of another keyword."""
    tree = KeywordTree()
    for keyword in ('blaaaaaf', 'bluez', 'aaaamen', 'uebergaaat'):
        tree.add(keyword)
    tree.finalize()

    self.assertEqual(('aaaamen', 3), tree.search('blaaaaamentada'))
    self.assertEqual(('aaaamen', 17), tree.search('clueuebergaaameblaaaamenbluez'))
class SymbolExtractor:
    """
    Takes a reddit submission and extracts all mentioned tickers
    Utilizes the aho corasick algorithm known from antivirus software
    """
    __tickers: DataFrame
    __searchTree: KeywordTree

    def __init__(self, ticker_file: str):
        """
        Create new symbol extractor
        :param ticker_file: Path to a tab-separated file with a Ticker column
            containing all relevant tickers
        """
        self.__tickers = pd.read_csv(ticker_file, sep="\t")
        self.__create_search_tree()

    def extract_symbols(self, submission: Submission) -> List[str]:
        """
        Extracts stock symbols from all text contained in a submission
        :param submission: to be searched
        :return: list of all found tickers without duplicates, ordered by first
            occurrence (title matches before text matches)
        """
        symbols: List[str] = self.__extract_symbols_from_title(submission)
        symbols += self.__extract_symbols_from_self_text(submission)
        symbols = self.__remove_duplicates(symbols)
        return symbols

    def __extract_symbols_from_title(self, submission: Submission) -> List[str]:
        """
        Extracts symbols from the title of a submission
        :param submission: to be searched
        :return: list of all found tickers
        """
        title = submission.title
        return self.find_symbols_in_text(title)

    def __extract_symbols_from_self_text(self, submission: Submission) -> List[str]:
        """
        Extracts symbols from the text of a submission
        :param submission: to be searched
        :return: list of all found tickers, empty if the submission has no
            ``self_text`` attribute
        """
        if hasattr(submission, "self_text"):
            text = submission.self_text
            return self.find_symbols_in_text(text)
        return []

    def find_symbols_in_text(self, text: str) -> List[str]:
        """
        Extracts symbols from a text
        :param text: to be searched
        :return: List of all found tickers, one entry per match (may contain duplicates)
        """
        matches = self.__searchTree.search_all(text)
        match_list = [ticker for (ticker, position) in matches]
        return match_list

    def __create_search_tree(self):
        """
        Initializes the search tree with the list of tickers in __tickers
        """
        self.__searchTree = KeywordTree()
        tickers: Series = self.__tickers.Ticker
        for ticker in tickers:
            self.__searchTree.add(ticker)
        self.__searchTree.finalize()

    @staticmethod
    def __remove_duplicates(symbols: List[str]):
        """
        Removes repeated symbols while keeping the order of first occurrence
        :param symbols: list that may contain duplicates
        :return: new list with each symbol exactly once
        """
        # dict keys are unique and keep insertion order, unlike a set whose
        # iteration order for strings can change from run to run
        return list(dict.fromkeys(symbols))
def test_simple(self):
    """Basic lookups: partial, empty and unrelated texts miss; contained keywords hit."""
    tree = KeywordTree()
    tree.add('bla')
    tree.add('blue')
    tree.finalize()

    # a mere keyword prefix, the empty text and unrelated text all find nothing
    for text in ('bl', '', 'zef'):
        self.assertIsNone(tree.search(text))

    self.assertEqual(('bla', 0), tree.search('blaaaa'))
    self.assertEqual(('blue', 10), tree.search('red green blue grey'))
def init_ahocorapy():
    """Return a finalized ``KeywordTree`` holding every entry of ``keyword_list``."""
    tree = KeywordTree()
    for entry in keyword_list:
        tree.add(entry)
    tree.finalize()
    return tree
from ahocorapy.keywordtree import KeywordTree

# One keyword tree per polarity; both are filled from the two dictionary files
# below and finalized at the end.
dic_positive = KeywordTree()
dic_negative = KeywordTree()

# Declinable words (verbs / adjectives): "<polarity>\t<term>" per line
with open('data/wago.121808.pn') as yougen:
    for line in yougen:
        line_splitted = line.strip().split('\t')
        if len(line_splitted) != 2:
            continue
        polarity_, term_ = line_splitted[:2]
        # only the first two characters of the polarity column are compared
        polarity = polarity_[:2]
        # terms are stored with spaces between tokens; index them without
        term = term_.replace(' ', '')
        if polarity == 'ポジ':
            dic_positive.add(term)
        elif polarity == 'ネガ':
            dic_negative.add(term)

# Nouns: "<term>\t<polarity>[\t...]" per line
with open('data/pn.csv.m3.120408.trim') as meishi:
    for line in meishi:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # skip blank or short lines instead of failing on the unpack below,
            # in line with the guard used for the first dictionary
            continue
        term, polarity = fields[:2]
        if polarity == 'p':
            dic_positive.add(term)
        elif polarity == 'n':
            dic_negative.add(term)

dic_positive.finalize()
dic_negative.finalize()