Exemple #1
0
    def _load_kw_trees(self) -> List[KeywordTree]:
        """Load one prefix tree per term dictionary file.

        Every dictionary file is named after the number of tokens in the terms
        it contains. Files are processed in reverse order of the list below,
        i.e. starting with the longest terms.

        :return: list of finalized prefix trees
        """
        dictionary_files = (
            '1.txt', '2.txt', '3.txt', '4.txt', '5.txt', '6.txt', '7.txt',
            '8.txt', '9.txt', '10.txt', '11.txt', '12.txt', '13.txt', '14.txt',
            '20.txt',
        )
        terms_dir = os.path.join(DICT_EXTRACTOR_PATH, TERMS_DIR_NAME)

        trees = []
        for file_name in reversed(dictionary_files):
            with open(os.path.join(terms_dir, file_name), 'r') as source:
                lines = source.read().split('\n')

            tree = KeywordTree()
            for line in lines:
                # Empty lines (e.g. the one after a trailing newline) carry no term.
                if line:
                    # A term is stored as its sequence of whitespace-separated tokens.
                    tree.add(line.split())
            tree.finalize()
            trees.append(tree)
        return trees
Exemple #2
0
class Monitor(threading.Thread):
    """Continuously scan for BLE advertisements.

    Thread that opens a bluetooth device through ``beacontools.backend``,
    enables LE scanning and passes every advertisement that survives the
    configured filters to ``callback``.
    """
    def __init__(self, callback, bt_device_id, device_filter, packet_filter,
                 scan_parameters):
        """Construct interface object.

        Args:
            callback: called as ``callback(bt_addr, rssi, packet, properties)``
                for each advertisement that passes the filters.
            bt_device_id: number of the bt device (hciX), handed to the
                backend's ``open_dev`` when the thread runs.
            device_filter: iterable of filter objects exposing ``matches()``,
                or None for no device filtering; also determines the scanner
                mode via ``get_mode``.
            packet_filter: packet types accepted by ``is_one_of``, or None for
                no packet-type filtering.
            scan_parameters: mapping of keyword arguments forwarded to
                ``set_scan_parameters`` when the thread runs.
        """
        # do import here so that the package can be used in parsing-only mode (no bluez required)
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        # loop flag checked by run(); cleared by terminate()
        self.keep_going = True
        self.callback = callback

        # number of the bt device (hciX)
        self.bt_device_id = bt_device_id
        # list of beacons to monitor
        self.device_filter = device_filter
        self.mode = get_mode(device_filter)
        # list of packet types to monitor
        self.packet_filter = packet_filter
        # bluetooth socket (opened in run())
        self.socket = None
        # keep track of Eddystone Beacon <-> bt addr mapping
        # stored as a list of (bt_addr, properties) tuples
        self.eddystone_mappings = []
        # parameters to pass to bt device
        self.scan_parameters = scan_parameters

        # construct an aho-corasick search tree for efficient prefiltering:
        # one byte pattern is registered per beacon family enabled in self.mode
        # NOTE(review): b"\x03\x03" is presumably the AD structure header
        # (length 3, type 0x03) that precedes a 16-bit service UUID - confirm
        service_uuid_prefix = b"\x03\x03"
        self.kwtree = KeywordTree()
        if self.mode & ScannerMode.MODE_IBEACON:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) +
                IBEACON_MANUFACTURER_ID + IBEACON_PROXIMITY_TYPE)
        if self.mode & ScannerMode.MODE_EDDYSTONE:
            self.kwtree.add(service_uuid_prefix + EDDYSTONE_UUID)
        if self.mode & ScannerMode.MODE_ESTIMOTE:
            self.kwtree.add(service_uuid_prefix + ESTIMOTE_UUID)
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) +
                ESTIMOTE_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_CJMONITOR:
            self.kwtree.add(
                bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + CJ_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            self.kwtree.add(service_uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        self.kwtree.finalize()

    def run(self):
        """Continuously scan for BLE advertisements.

        Opens the device, applies the scan parameters, enables scanning and
        then reads packets until ``keep_going`` is cleared. The socket is
        closed when the loop ends.
        """
        self.socket = self.backend.open_dev(self.bt_device_id)

        self.set_scan_parameters(**self.scan_parameters)
        self.toggle_scan(True)

        while self.keep_going:
            pkt = self.socket.recv(255)
            # byte 1 is read as the event code, byte 3 as the subevent code
            event = to_int(pkt[1])
            subevent = to_int(pkt[3])
            if event == LE_META_EVENT and subevent == EVT_LE_ADVERTISING_REPORT:
                # we have a BLE advertisement
                self.process_packet(pkt)
        self.socket.close()

    def set_scan_parameters(self,
                            scan_type=ScanType.ACTIVE,
                            interval_ms=10,
                            window_ms=10,
                            address_type=BluetoothAddressType.RANDOM,
                            filter_type=ScanFilter.ALL):
        """Set the LE scan parameters.

        Args:
            scan_type: ScanType.(PASSIVE|ACTIVE)
            interval_ms: ms (as float) between scans (valid range 2.5ms - 10240ms)
                ..note:: when interval and window are equal, the scan
                    runs continuously
            window_ms: ms (as float) scan duration (valid range 2.5ms - 10240ms)
            address_type: Bluetooth address type BluetoothAddressType.(PUBLIC|RANDOM)
                * PUBLIC = use device MAC address
                * RANDOM = generate a random MAC address and use that
            filter_type: ScanFilter.(ALL|WHITELIST_ONLY) only ALL is supported, which will
                return all fetched bluetooth packets (WHITELIST_ONLY is not supported,
                because OCF_LE_ADD_DEVICE_TO_WHITE_LIST command is not implemented)

        Raises:
            ValueError: A value had an unexpected format or was not in range
        """
        # convert milliseconds into the units expected by the controller and
        # validate against the accepted range 0x0004..0x4000
        # NOTE(review): the error messages below interpolate the converted
        # value, not the millisecond value the caller passed in
        interval_fractions = interval_ms / MS_FRACTION_DIVIDER
        if interval_fractions < 0x0004 or interval_fractions > 0x4000:
            raise ValueError(
                "Invalid interval given {}, must be in range of 2.5ms to 10240ms!"
                .format(interval_fractions))
        window_fractions = window_ms / MS_FRACTION_DIVIDER
        if window_fractions < 0x0004 or window_fractions > 0x4000:
            raise ValueError(
                "Invalid window given {}, must be in range of 2.5ms to 10240ms!"
                .format(window_fractions))

        # struct.pack needs integers for the "H" fields; int() truncates
        interval_fractions, window_fractions = int(interval_fractions), int(
            window_fractions)

        # little-endian: u8 scan type, u16 interval, u16 window,
        # u8 address type, u8 filter policy
        scan_parameter_pkg = struct.pack("<BHHBB", scan_type,
                                         interval_fractions, window_fractions,
                                         address_type, filter_type)
        self.backend.send_cmd(self.socket, OGF_LE_CTL,
                              OCF_LE_SET_SCAN_PARAMETERS, scan_parameter_pkg)

    def toggle_scan(self, enable, filter_duplicates=False):
        """Enable or disable BLE scanning.

        Args:
            enable: boolean value to enable (True) or disable (False) scanner
            filter_duplicates: boolean value to enable/disable filter, that
                omits duplicated packets
        """
        # two unsigned bytes: enable flag, duplicate-filter flag
        command = struct.pack("BB", enable, filter_duplicates)
        self.backend.send_cmd(self.socket, OGF_LE_CTL, OCF_LE_SET_SCAN_ENABLE,
                              command)

    def process_packet(self, pkt):
        """Parse the packet and call callback if one of the filters matches.

        Args:
            pkt: raw packet as returned by the socket; bytes 7-12 are read as
                the bluetooth address, bytes 14 up to (excluding) the last one
                as the advertisement payload and the last byte as the RSSI.
        """
        payload = pkt[14:-1]
        # check if this could be a valid packet before parsing
        # this reduces the CPU load significantly
        if not self.kwtree.search(payload):
            return

        bt_addr = bt_addr_to_string(pkt[7:13])
        rssi = bin_to_int(pkt[-1])
        # strip bluetooth address and parse packet
        packet = parse_packet(payload)

        # return if packet was not a beacon advertisement
        if not packet:
            return

        # we need to remember which eddystone beacon has which bt address
        # because the TLM and URL frames do not contain the namespace and instance
        self.save_bt_addr(packet, bt_addr)
        # properties holds the identifying information for a beacon
        # e.g. instance and namespace for eddystone; uuid, major, minor for iBeacon
        properties = self.get_properties(packet, bt_addr)

        if self.device_filter is None and self.packet_filter is None:
            # no filters selected
            self.callback(bt_addr, rssi, packet, properties)

        elif self.device_filter is None:
            # filter by packet type
            if is_one_of(packet, self.packet_filter):
                self.callback(bt_addr, rssi, packet, properties)
        else:
            # filter by device and packet type
            if self.packet_filter and not is_one_of(packet,
                                                    self.packet_filter):
                # return if packet filter does not match
                return

            # iterate over filters and call .matches() on each;
            # the callback fires at most once, for the first matching filter
            for filtr in self.device_filter:
                if isinstance(filtr, BtAddrFilter):
                    # address filters are matched against the bt address only
                    if filtr.matches({'bt_addr': bt_addr}):
                        self.callback(bt_addr, rssi, packet, properties)
                        return

                elif filtr.matches(properties):
                    self.callback(bt_addr, rssi, packet, properties)
                    return

    def save_bt_addr(self, packet, bt_addr):
        """Add to the list of mappings.

        Only Eddystone UID frames are recorded; an existing entry for the same
        bluetooth address is replaced.
        """
        if isinstance(packet, EddystoneUIDFrame):
            # remove out old mapping
            new_mappings = [
                m for m in self.eddystone_mappings if m[0] != bt_addr
            ]
            new_mappings.append((bt_addr, packet.properties))
            self.eddystone_mappings = new_mappings

    def get_properties(self, packet, bt_addr):
        """Get properties of beacon depending on type.

        Returns:
            ``packet.properties`` for most packets; for the Eddystone frame
            types listed below, the properties previously stored for
            ``bt_addr`` (None when no mapping exists).
        """
        if is_one_of(packet, [EddystoneTLMFrame, EddystoneURLFrame, \
                              EddystoneEncryptedTLMFrame, EddystoneEIDFrame]):
            # here we retrieve the namespace and instance which corresponds to the
            # eddystone beacon with this bt address
            return self.properties_from_mapping(bt_addr)
        else:
            return packet.properties

    def properties_from_mapping(self, bt_addr):
        """Retrieve properties (namespace, instance) for the specified bt address.

        Returns None when the address has not been recorded by save_bt_addr.
        """
        for addr, properties in self.eddystone_mappings:
            if addr == bt_addr:
                return properties
        return None

    def terminate(self):
        """Signal runner to stop and join thread."""
        # NOTE(review): run() only re-checks keep_going after recv() returns,
        # so if recv() blocks, join() may wait until another packet arrives
        self.toggle_scan(False)
        self.keep_going = False
        self.join()
# arr = [95, 1, 5, 67, 223, 566, 33, 67, 13, 1, 5, 67]

# 0 = bear
# 1 or 2 = bull
day_selector = 1

# Rows of `data` hold open/high/low/close in columns 2..5.
c_open = float(data[day_selector][2])
c_high = float(data[day_selector][3])
c_low = float(data[day_selector][4])
c_close = float(data[day_selector][5])

# print("Open Ref: " + str(c_open) + "\n")
print(build_candle_id(c_close, c_open, c_high, c_low))

# Index the candle id of every row so that ids can be looked up in a text.
kwtree = KeywordTree(case_insensitive=True)
for row in data:
    c_open = float(row[2])
    c_high = float(row[3])
    c_low = float(row[4])
    c_close = float(row[5])
    kwtree.add(build_candle_id(c_close, c_open, c_high, c_low))
kwtree.finalize()

# search() returns only the first match as one (keyword, index) tuple, or None
# when nothing matches, so looping over its result would walk the tuple's
# fields (or fail on None). search_all() yields every match.
results = list(kwtree.search_all('S-22.77-51.3-25.93-72'))

res_count = len(results)
for result in results:
    print(result)
Exemple #4
0
import random
import pathlib
from termcolor import colored

U = "CTAGTTAG"
V = "bvbccvCTAnGTTAGvfqvsdqvqCTAGTTAcGvfdCTACGATAGvvfGTTGTTfdvCTAtggAGsfsdfdCTAdddddddddddAGvbcvbcvb"

print("\n", "Text : ", V)
print("Motif :", U)

erreur = input("erreur  : ")
# Cut the motif into consecutive pieces of `erreur` characters
# (the last piece may be shorter).
piece_length = int(erreur)
Pi = []
for start in range(0, len(U), piece_length):
    Pi.append(U[start:start + piece_length])
print(Pi)

# Aho-Corasick search for every piece of the motif inside the text
kwtree = KeywordTree(case_insensitive=True)
for piece in Pi:
    kwtree.add(piece)
kwtree.finalize()
results = kwtree.search_all(V)

# Collect all occurrences: matched piece and the position it was found at
Vals = []
Keyz = []
for keyword, position in results:
    Vals.append(keyword)
    Keyz.append(position)

# position -> matched piece (a later match at the same position wins)
dictionary = {}
for position, keyword in zip(Keyz, Vals):
    dictionary[position] = keyword
print(dictionary, "\n")
Exemple #5
0
import numpy as np
import pandas as pd
import csv
from ahocorapy.keywordtree import KeywordTree

# Initializing Brand Names
# The brand name is the first column of every row in brandNames.csv.
# The context manager closes the file once all rows are read, and newline=''
# is the mode the csv module asks for so embedded newlines parse correctly.
with open('brandNames.csv', 'r', newline='') as brandNamesFile:
    reader = csv.reader(brandNamesFile)
    allRows = list(reader)

# Blank lines come back from csv.reader as empty rows; skip them rather than
# failing on row[0].
brandNames = [row[0] for row in allRows if row]

# Initializing ahocorasick search
kwtree = KeywordTree(case_insensitive=True)

for brand in brandNames:
    kwtree.add(brand)

kwtree.finalize()

# Initializing praw reddit client
client_id = input('client_id: ')
client_secret = input('client_secret: ')
username = input('username: '******'password: '******'WAYWT Stats Scraper',
Exemple #6
0
    def test_visualizer(self):
        # Needs working pygraphviz on system.
        # Draws the README example tree into readme_example.png.
        readme_keywords = (
            'malaga',
            'lacrosse',
            'mallorca',
            'mallorca bella',
            'orca',
        )
        tree = KeywordTree(case_insensitive=True)
        for keyword in readme_keywords:
            tree.add(keyword)
        tree.finalize()

        Visualizer().draw('readme_example.png', tree)
Exemple #7
0
class Monitor(threading.Thread):
    """Continuously scan for BLE advertisements.

    Thread that opens a bluetooth device through ``beacontools.backend``,
    queries the controller's HCI version, enables LE scanning (legacy or
    extended commands depending on that version) and passes every
    advertisement that survives the configured filters to ``callback``.
    """

    def __init__(self, callback, bt_device_id, device_filter, packet_filter, scan_parameters):
        """Construct interface object.

        Args:
            callback: called as ``callback(bt_addr, rssi, packet, properties)``
                for each advertisement that passes the filters.
            bt_device_id: number of the bt device (hciX), handed to the
                backend's ``open_dev`` when the thread runs.
            device_filter: iterable of filter objects exposing ``matches()``,
                or None for no device filtering; also determines the scanner
                mode via ``get_mode``.
            packet_filter: packet types accepted by ``is_one_of``, or None for
                no packet-type filtering.
            scan_parameters: mapping of keyword arguments forwarded to
                ``set_scan_parameters`` when the thread runs.
        """
        # do import here so that the package can be used in parsing-only mode (no bluez required)
        self.backend = import_module('beacontools.backend')

        threading.Thread.__init__(self)
        self.daemon = False
        # loop flag checked by run(); cleared by terminate()
        self.keep_going = True
        self.callback = callback

        # number of the bt device (hciX)
        self.bt_device_id = bt_device_id
        # list of beacons to monitor
        self.device_filter = device_filter
        self.mode = get_mode(device_filter)
        # list of packet types to monitor
        self.packet_filter = packet_filter
        # bluetooth socket (opened in run())
        self.socket = None
        # keep track of Eddystone Beacon <-> bt addr mapping
        # stored as a list of (bt_addr, properties) tuples
        self.eddystone_mappings = []
        # parameters to pass to bt device
        self.scan_parameters = scan_parameters
        # hci version; placeholder until run() replaces it with get_hci_version()
        self.hci_version = HCIVersion.BT_CORE_SPEC_1_0

        # construct an aho-corasick search tree for efficient prefiltering:
        # one byte pattern is registered per beacon family enabled in self.mode
        # NOTE(review): b"\x03\x03" is presumably the AD structure header
        # (length 3, type 0x03) that precedes a 16-bit service UUID - confirm
        service_uuid_prefix = b"\x03\x03"
        self.kwtree = KeywordTree()
        if self.mode & ScannerMode.MODE_IBEACON:
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + IBEACON_MANUFACTURER_ID + IBEACON_PROXIMITY_TYPE)
        if self.mode & ScannerMode.MODE_EDDYSTONE:
            self.kwtree.add(service_uuid_prefix + EDDYSTONE_UUID)
        if self.mode & ScannerMode.MODE_ESTIMOTE:
            self.kwtree.add(service_uuid_prefix + ESTIMOTE_UUID)
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + ESTIMOTE_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_CJMONITOR:
            self.kwtree.add(bytes([MANUFACTURER_SPECIFIC_DATA_TYPE]) + CJ_MANUFACTURER_ID)
        if self.mode & ScannerMode.MODE_EXPOSURE_NOTIFICATION:
            self.kwtree.add(service_uuid_prefix + EXPOSURE_NOTIFICATION_UUID)
        self.kwtree.finalize()

    def run(self):
        """Continuously scan for BLE advertisements.

        Opens the device, reads the HCI version, applies the scan parameters,
        enables scanning and then reads packets until ``keep_going`` is
        cleared. The socket is closed when the loop ends.
        """
        self.socket = self.backend.open_dev(self.bt_device_id)

        # must be known before the two calls below: both choose between the
        # legacy and the extended HCI command based on it
        self.hci_version = self.get_hci_version()
        self.set_scan_parameters(**self.scan_parameters)
        self.toggle_scan(True)

        while self.keep_going:
            pkt = self.socket.recv(255)
            # byte 1 is read as the event code, byte 3 as the subevent code
            event = to_int(pkt[1])
            subevent = to_int(pkt[3])
            if event == LE_META_EVENT and subevent in [EVT_LE_ADVERTISING_REPORT, EVT_LE_EXT_ADVERTISING_REPORT]:
                # we have a BLE advertisement
                self.process_packet(pkt)
        self.socket.close()

    def get_hci_version(self):
        """Get the HCI version of the opened device.

        Sends a read-local-version request through the backend and parses the
        response with the ``local_version`` layout below.

        Returns:
            The ``HCIVersion`` for the reported ``hci_version`` byte, or
            ``HCIVersion.BT_CORE_SPEC_1_0`` when the response cannot be parsed
            (``ConstructError``) or the request raises ``NotImplementedError``.
        """
        # field layout of the response payload
        local_version = Struct(
            "status" / Byte,
            "hci_version" / Byte,
            "hci_revision" / Bytes(2),
            "lmp_version" / Byte,
            "manufacturer_name" / Bytes(2),
            "lmp_subversion" / Bytes(2),
        )

        try:
            resp = self.backend.send_req(self.socket, OGF_INFO_PARAM, OCF_READ_LOCAL_VERSION,
                                         EVT_CMD_COMPLETE, local_version.sizeof(), bytes(), 0)
            # only the first parsed record is used
            return HCIVersion(GreedyRange(local_version).parse(resp)[0]["hci_version"])
        except (ConstructError, NotImplementedError):
            return HCIVersion.BT_CORE_SPEC_1_0

    def set_scan_parameters(self, scan_type=ScanType.ACTIVE, interval_ms=10, window_ms=10,
                            address_type=BluetoothAddressType.RANDOM, filter_type=ScanFilter.ALL):
        """Set the LE scan parameters.

        For extended set scan parameters command additional parameter scanning PHYs has to be provided.
        The parameter indicates the PHY(s) on which the advertising packets should be received on the
        primary advertising physical channel. For further information have a look on BT Core 5.1 Specification,
        page 1439 ( LE Set Extended Scan Parameters command).

        Args:
            scan_type: ScanType.(PASSIVE|ACTIVE)
            interval_ms: ms (as float) between scans (valid range 2.5ms - 10240ms or 40.95s for extended version)
                ..note:: when interval and window are equal, the scan
                    runs continuously
            window_ms: ms (as float) scan duration (valid range 2.5ms - 10240ms or 40.95s for extended version)
            address_type: Bluetooth address type BluetoothAddressType.(PUBLIC|RANDOM)
                * PUBLIC = use device MAC address
                * RANDOM = generate a random MAC address and use that
            filter_type: ScanFilter.(ALL|WHITELIST_ONLY) only ALL is supported, which will
                return all fetched bluetooth packets (WHITELIST_ONLY is not supported,
                because OCF_LE_ADD_DEVICE_TO_WHITE_LIST command is not implemented)

        Raises:
            ValueError: A value had an unexpected format or was not in range
        """
        # upper bound depends on whether the extended command will be used
        max_interval = (0x4000 if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else 0xFFFF)
        # convert milliseconds into the units expected by the controller
        # NOTE(review): the error messages below interpolate the converted
        # value, not the millisecond value the caller passed in
        interval_fractions = interval_ms / MS_FRACTION_DIVIDER
        if interval_fractions < 0x0004 or interval_fractions > max_interval:
            raise ValueError(
                "Invalid interval given {}, must be in range of 2.5ms to {}ms!".format(
                    interval_fractions, max_interval * MS_FRACTION_DIVIDER))
        window_fractions = window_ms / MS_FRACTION_DIVIDER
        if window_fractions < 0x0004 or window_fractions > max_interval:
            raise ValueError(
                "Invalid window given {}, must be in range of 2.5ms to {}ms!".format(
                    window_fractions, max_interval * MS_FRACTION_DIVIDER))

        # struct.pack needs integers for the "H" fields; int() truncates
        interval_fractions, window_fractions = int(interval_fractions), int(window_fractions)

        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            # legacy command: u8 scan type, u16 interval, u16 window,
            # u8 address type, u8 filter policy (little-endian)
            command_field = OCF_LE_SET_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BHHBB",
                scan_type,
                interval_fractions,
                window_fractions,
                address_type,
                filter_type)
        else:
            # extended command: note the different field order and the extra
            # scanning-PHYs byte
            command_field = OCF_LE_SET_EXT_SCAN_PARAMETERS
            scan_parameter_pkg = struct.pack(
                "<BBBBHH",
                address_type,
                filter_type,
                1,  # scan advertisements on the LE 1M PHY
                scan_type,
                interval_fractions,
                window_fractions)

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, scan_parameter_pkg)

    def toggle_scan(self, enable, filter_duplicates=False):
        """Enable or disable BLE scanning.

        For extended set scan enable command additional parameters duration and period have
        to be provided. When both are zero, the controller shall continue scanning until
        scanning is disabled. For non-zero values have a look on BT Core 5.1 Specification,
        page 1442 (LE Set Extended Scan Enable command).

        Args:
            enable: boolean value to enable (True) or disable (False) scanner
            filter_duplicates: boolean value to enable/disable filter, that
                omits duplicated packets
        """
        if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0:
            command_field = OCF_LE_SET_SCAN_ENABLE
            # two unsigned bytes: enable flag, duplicate-filter flag
            command = struct.pack("BB", enable, filter_duplicates)
        else:
            command_field = OCF_LE_SET_EXT_SCAN_ENABLE
            command = struct.pack("<BBHH", enable, filter_duplicates,
                                  0,  # duration
                                  0   # period
                                  )

        self.backend.send_cmd(self.socket, OGF_LE_CTL, command_field, command)

    def process_packet(self, pkt):
        """Parse the packet and call callback if one of the filters matches.

        Args:
            pkt: raw packet as returned by the socket. Bytes 7-12 are read as
                the bluetooth address. Before HCI version 5.0 the payload is
                bytes 14 up to (excluding) the last byte and the RSSI is the
                last byte; otherwise the payload starts at byte 29 and the
                RSSI is byte 18.
        """
        payload = pkt[14:-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[29:]

        # check if this could be a valid packet before parsing
        # this reduces the CPU load significantly
        if not self.kwtree.search(payload):
            return

        bt_addr = bt_addr_to_string(pkt[7:13])
        rssi = bin_to_int(pkt[-1] if self.hci_version < HCIVersion.BT_CORE_SPEC_5_0 else pkt[18])
        # strip bluetooth address and parse packet
        packet = parse_packet(payload)

        # return if packet was not a beacon advertisement
        if not packet:
            return

        # we need to remember which eddystone beacon has which bt address
        # because the TLM and URL frames do not contain the namespace and instance
        self.save_bt_addr(packet, bt_addr)
        # properties holds the identifying information for a beacon
        # e.g. instance and namespace for eddystone; uuid, major, minor for iBeacon
        properties = self.get_properties(packet, bt_addr)

        if self.device_filter is None and self.packet_filter is None:
            # no filters selected
            self.callback(bt_addr, rssi, packet, properties)

        elif self.device_filter is None:
            # filter by packet type
            if is_one_of(packet, self.packet_filter):
                self.callback(bt_addr, rssi, packet, properties)
        else:
            # filter by device and packet type
            if self.packet_filter and not is_one_of(packet, self.packet_filter):
                # return if packet filter does not match
                return

            # iterate over filters and call .matches() on each;
            # the callback fires at most once, for the first matching filter
            for filtr in self.device_filter:
                if isinstance(filtr, BtAddrFilter):
                    # address filters are matched against the bt address only
                    if filtr.matches({'bt_addr':bt_addr}):
                        self.callback(bt_addr, rssi, packet, properties)
                        return

                elif filtr.matches(properties):
                    self.callback(bt_addr, rssi, packet, properties)
                    return

    def save_bt_addr(self, packet, bt_addr):
        """Add to the list of mappings.

        Only Eddystone UID frames are recorded; an existing entry for the same
        bluetooth address is replaced.
        """
        if isinstance(packet, EddystoneUIDFrame):
            # remove out old mapping
            new_mappings = [m for m in self.eddystone_mappings if m[0] != bt_addr]
            new_mappings.append((bt_addr, packet.properties))
            self.eddystone_mappings = new_mappings

    def get_properties(self, packet, bt_addr):
        """Get properties of beacon depending on type.

        Returns:
            ``packet.properties`` for most packets; for the Eddystone frame
            types listed below, the properties previously stored for
            ``bt_addr`` (None when no mapping exists).
        """
        if is_one_of(packet, [EddystoneTLMFrame, EddystoneURLFrame, \
                              EddystoneEncryptedTLMFrame, EddystoneEIDFrame]):
            # here we retrieve the namespace and instance which corresponds to the
            # eddystone beacon with this bt address
            return self.properties_from_mapping(bt_addr)
        else:
            return packet.properties

    def properties_from_mapping(self, bt_addr):
        """Retrieve properties (namespace, instance) for the specified bt address.

        Returns None when the address has not been recorded by save_bt_addr.
        """
        for addr, properties in self.eddystone_mappings:
            if addr == bt_addr:
                return properties
        return None

    def terminate(self):
        """Signal runner to stop and join thread."""
        # NOTE(review): run() only re-checks keep_going after recv() returns,
        # so if recv() blocks, join() may wait until another packet arrives
        self.toggle_scan(False)
        self.keep_going = False
        self.join()
Exemple #8
0

def load_words():
    """Read ``../data/dict/words.txt`` (relative to ``basedir``) into a set.

    Each line is stripped of surrounding whitespace before it is stored.
    """
    words_path = os.path.join(basedir, '../data/dict/words.txt')
    with open(words_path) as words_file:
        return {line.strip() for line in words_file}


keywords.update(load_dict())
keywords.update(load_words())

# The keywords are literal strings (the same values go into the Aho-Corasick
# tree below), so each one is escaped before it is joined into the regex
# alternation. Unescaped, a word containing a character such as '*', '+', '('
# or '.' would change the meaning of the pattern or make it fail to compile.
# Empty strings are left out because an empty alternative matches everywhere.
keyword_pattern = re.compile(
    '(' + '|'.join(re.escape(word) for word in keywords if word) + ')',
    re.IGNORECASE)

# Case-insensitive Aho-Corasick tree over the same keyword set.
keyword_tree = KeywordTree(case_insensitive=True)
for word in keywords:
    keyword_tree.add(word)
keyword_tree.finalize()


def nsfw_text(text: str):
    """Return True when ``keyword_tree`` finds a keyword in *text*, else False."""
    first_match = keyword_tree.search(text)
    return bool(first_match)


# Matches a single word character that has one space directly before and one
# directly after it; the spaces are lookarounds and are not part of the match.
anti_filter_pattern = r'(?<= )(\w)(?= )'


def anti_fiter(text: str):
Exemple #9
0
    def test_finalize_errors(self):
        # Searching a tree that has not been finalized is an error.
        unfinalized = KeywordTree(case_insensitive=True)
        for keyword in ('bla', 'blue'):
            unfinalized.add(keyword)
        with self.assertRaises(ValueError):
            unfinalized.search('blueb')

        # Adding a keyword after finalize() is an error.
        frozen = KeywordTree(case_insensitive=True)
        frozen.add('bla')
        frozen.finalize()
        with self.assertRaises(ValueError):
            frozen.add('blueb')

        # Finalizing a second time is an error.
        finalized_once = KeywordTree(case_insensitive=True)
        finalized_once.add('bla')
        finalized_once.finalize()
        with self.assertRaises(ValueError):
            finalized_once.finalize()
Exemple #10
0
    def test_case_insensitivity_mode(self):
        # In case-insensitive mode the match is reported with the keyword's
        # original spelling, whatever the casing of the searched text.
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual(('bla', 0), tree.search('bLa'))
        self.assertEqual(('blISs', 0), tree.search('BLISS'))
Exemple #11
0
    def test_case_sensitivity(self):
        # By default only text with exactly the keyword's casing matches.
        tree = KeywordTree()
        for keyword in ('bla', 'blue', 'blISs'):
            tree.add(keyword)
        tree.finalize()

        for wrong_case in ('bLa', 'BLISS', 'bliss'):
            self.assertIsNone(tree.search(wrong_case))

        self.assertEqual(('blISs', 0), tree.search('blISs'))
Exemple #12
0
    def test_unicode(self):
        # Non-ASCII keywords are found and reported with their character offset.
        tree = KeywordTree()
        for keyword in ('bla', 'blue', u'颜到'):
            tree.add(keyword)
        tree.finalize()

        self.assertEqual((u'颜到', 4), tree.search(u'春华变苍颜到处群魔乱'))
        self.assertIsNone(tree.search(u'三年过'))
Exemple #13
0
    def test_domains(self):
        # A domain keyword must be found inside an e-mail address.
        kwtree = KeywordTree()
        kwtree.add('searchenginemarketingfordummies.com')
        kwtree.add('linkpt.com')
        kwtree.add('fnbpeterstown.com')
        kwtree.finalize()

        # The searched text had been masked to '*****@*****.**', which cannot
        # contain 'linkpt.com', so the assertion below could never hold.
        # The local part plus '@' is 10 characters long, which puts the
        # keyword at the asserted offset.
        result = kwtree.search('peterchen@linkpt.com')
        self.assertEqual(('linkpt.com', 10), result)
Exemple #14
0
    def test_empty_tree(self):
        # A finalized tree without keywords never matches.
        empty_tree = KeywordTree()
        empty_tree.finalize()

        self.assertIsNone(empty_tree.search('zef'))
def ahocorasick_any_match(text_info):
    """Check a record against its search conditions with one Aho-Corasick tree.

    ``text_info[1]`` is read positionally:

    * ``[2]`` name condition (string)
    * ``[3]`` lang condition (string)
    * ``[4]`` type condition (string)
    * ``[5]`` tags conditions (list)
    * ``[6]`` categories conditions (list)
    * ``[7]`` the record, a mapping with the keys ``"name"``, ``"lang"``,
      ``"type"``, ``"tags"`` and ``"categories"``

    Every non-empty condition is added to one shared, case-insensitive tree.
    A field passes when its condition is empty or when the tree finds any
    keyword in the field's text.

    Returns:
        ``text_info`` when all five fields pass, otherwise ``False``.
    """
    conditions = text_info[1]
    name_cond, lang_cond, type_cond = conditions[2], conditions[3], conditions[4]
    tag_conds, category_conds = conditions[5], conditions[6]

    # An empty condition is satisfied unconditionally. Empty tags/categories
    # used to set the *name* flag, so a record with no tag or category
    # condition could only pass if the tree happened to match those fields.
    bool_name = not name_cond
    bool_lang = not lang_cond
    bool_type = not type_cond
    bool_tags = not tag_conds
    bool_categories = not category_conds

    # Build the tree from every non-empty condition.
    # NOTE(review): because the tree is shared, a field can be satisfied by a
    # keyword that came from another field's condition - confirm this is the
    # intended "any match" semantics.
    kwtree_any = KeywordTree(case_insensitive=True)
    for keyword in (name_cond, lang_cond, type_cond):
        if keyword:
            kwtree_any.add(keyword)
    for keyword_list in (tag_conds, category_conds):
        if keyword_list:
            for keyword in keyword_list:
                kwtree_any.add(keyword)
    kwtree_any.finalize()

    # Match each field of the record against the tree.
    record = conditions[7]
    if kwtree_any.search_one(record["name"]):
        bool_name = True
    if kwtree_any.search_one(record["lang"]):
        bool_lang = True
    if kwtree_any.search_one(record["type"]):
        bool_type = True
    # list-valued fields are flattened to one string before searching
    if kwtree_any.search_one(helper_list_to_str(record["tags"])):
        bool_tags = True
    if kwtree_any.search_one(helper_list_to_str(record["categories"])):
        bool_categories = True

    if bool_name and bool_lang and bool_type and bool_tags and bool_categories:
        return text_info
    return False
Exemple #16
0
 def __init__(self, filename, minus_words):
     """Store the file name and build a case-insensitive tree of minus words."""
     self.filename = filename

     minus_word_tree = KeywordTree(case_insensitive=True)
     for minus_word in minus_words:
         minus_word_tree.add(minus_word)
     minus_word_tree.finalize()
     self.tree = minus_word_tree
    'Trump has no clue how to negotiate trade with China or any other country. He has no idea how tariffs really work. He refuses to see how his ridiculous trade war will hurt average Americans.',
    'China has 51 female billionaires, South Korea has 1, Japan has none. And the answer might well be down to something Chairman Mao unintentionally set in motion',
    'President Trump today called his trade war with China a “little squabble.” It’s not a squabble, it’s a disaster with huge costs for California consumers, farmers and businesses. It’s time to stop holding American consumers and producers hostage as trade talks continue.',
    'All that President Trump has to do to win the Great Trade War with China is to give American farmers some tariff money to compensate them. Meanwhile, he can sit back & watch the tech industry get hammered - which they richly deserve. Thats called a win-win.',
    'The Trump admin has made $8,520,000,000 in direct payments to farmers through a 2018 aid program designed to counter losses stemming from Trumps trade war with China, Axios reports. ',
    'Farmer who voted for Trump says he’ll "never vote for him again" as family set to lose $150,000 in China trade war ',
    'Republicans borrowing another 20 billion dollars from China to pay farmers who cannot sell their products to China because of the trade war shows their loyalty to free market principals wait a second I don’t get it.',
    'Trump lost 5 points on Rasmussen in the past 3 days.  Im guessing delayed reaction to people not understanding the trade war with China and the one day stock selloff.',
    'BREAKING NEWS: Chinese people rush to buy $3 Donald Trump toilet brushes amid trade war with the U.S. & joke Trump can be so useful. People in China are cheering for Beijing in a trade war with Washington by cleaning their toilets with brushes that look like the U.S. President. ',
    'Most Chinese agree that the US is more powerful than China and Washington holds initiative in the trade war. But we just dont want to cave in and we believe there is no way the US can crush China. We are willing to bear some pain to give the US a lesson.',
    'China will fight to the end if President Trumps trade war continues says Beijings ambassador to the UK as he calls Americans troublemakers who are fixated on us first',
    'My discussion with soybean farmer and Wisconsin Soybean Association president Tony Mellenthin @MellenthinFarms about how the trade war with China is currently affecting farmers, and what happens once it is over. '
]

# AHO - CORASICK #
kwtree = KeywordTree(case_insensitive=True)

df = pd.DataFrame()
matches = []
tmp = []
# The keyword text is the first field of every entry in `keywords`.
clean_keywords = [entry[0] for entry in keywords]
# Add keywords to the trie
for keyword in clean_keywords:
    kwtree.add(keyword)
kwtree.finalize()

# Run a search on every tweet and add to dataframe

for i in range(len(tweets)):
    matches = []
Exemple #18
0
    def test_readme_example(self):
        '''
        Mirrors the example shown in the project's README. When this test
        case has to change, update the README to match.
        '''
        tree = KeywordTree(case_insensitive=True)
        for keyword in ('malaga', 'lacrosse', 'mallorca', 'mallorca bella',
                        'orca'):
            tree.add(keyword)
        tree.finalize()

        # search() reports the first match only
        self.assertEqual(
            ('malaga', 24),
            tree.search('My favorite islands are malaga and sylt.'))
        self.assertEqual(
            ('lacrosse', 29),
            tree.search('idontlikewhitespaceswhereismalacrossequestionmark'))

        # search_all() yields every match, overlapping ones included
        all_matches = tree.search_all('malheur on mallorca bellacrosse')
        self.assertIsNotNone(all_matches)
        expected_matches = [
            ('mallorca', 11),
            ('orca', 15),
            ('mallorca bella', 11),
            ('lacrosse', 23),
        ]
        for expected in expected_matches:
            self.assertEqual(expected, next(all_matches))
        with self.assertRaises(StopIteration):
            next(all_matches)
Exemple #19
0
class OWMCitySlot:
    """Find city names in free-text utterances with a keyword trie."""

    # Translation table turning every ASCII punctuation character into a space.
    _PUNCT_TO_SPACE = str.maketrans(
        string.punctuation, " " * len(string.punctuation))

    def __init__(
        self,
        path_to_geo_entities: str = "data/openweathermap_city_list.json"
    ) -> None:
        """Initialize a trie for finding city names.

        :param path_to_geo_entities: filepath to a JSON file containing a list of cities
            file format: ["Ḩeşār-e Sefīd", "‘Ayn Ḩalāqīm", "Taglag", ..... , "Gerton"]
            this list was created using the source file: https://bulk.openweathermap.org/sample/city.list.json.gz
        :type path_to_geo_entities: str
        """
        self.geonames = self._load_from_json(path_to_geo_entities)
        self.kwtree = KeywordTree(case_insensitive=True)
        for geo in self.geonames:
            # Pad with spaces so that a name only matches on word boundaries
            # (the utterance is padded the same way before searching).
            # NOTE(review): names are added verbatim, while utterances have
            # punctuation replaced by spaces, so a name that itself contains
            # punctuation presumably can never match -- confirm and consider
            # normalizing the names as well.
            self.kwtree.add(f" {geo} ")
        self.kwtree.finalize()

    def _load_from_json(self, path_to_geo_entities: str) -> List[str]:
        """Load a de-duplicated list of city names from a JSON file.

        :param path_to_geo_entities: filepath to a JSON file
        :type path_to_geo_entities: str
        :return: a list containing each city name once (order is unspecified)
        :rtype: List[str]
        """
        with open(path_to_geo_entities, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        return list(set(json_data))

    @classmethod
    def _normalize_utterance(cls, utterance: str) -> str:
        """Lower-case an utterance and reduce it to single-space separated words.

        Punctuation is replaced with spaces, then every run of whitespace
        (including a lone tab or newline) is collapsed into one space and the
        result is stripped.

        :param utterance: human utterance
        :type utterance: str
        :return: the normalized utterance
        :rtype: str
        """
        without_punctuation = utterance.translate(cls._PUNCT_TO_SPACE)
        # `\s+` rather than `\s{2,}`: a single tab/newline must also become a
        # plain space, otherwise it blocks the space-padded keyword match.
        return re.sub(r"\s+", " ", without_punctuation.lower()).strip()

    def find_geo_names_in_utterance(self, utterance: str) -> str:
        """Search the first occurrence of the location name in utterance.

        :param utterance: human utterance
        :type utterance: str
        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        normalized = self._normalize_utterance(utterance)
        results = list(self.kwtree.search_all(f" {normalized} "))
        # TODO the method could be improved if we search all geo names and then filter
        # the most precises geo entity.
        # User may write: "Massachusetts Boston" -> It has 2 entities, and Boston is preferred
        # because it is more precise location.
        return self.get_best_match(results)

    def get_best_match(self, results: Iterable[Tuple[str, int]]) -> str:
        """Select from the objects with the lowest index the object with the longest length.

         Usually the earliest entity is the most precise.
         For example for the utterance: "west valley city utah", we receive:
         [(' West ', 0), (' West Valley ', 0), (' Valley ', 5), (' West Valley City ', 0),
         (' Valley City ', 5), (' Utah ', 17)], we should select "West Valley City".

        Works for any iterable, including one-shot iterators/generators.

        :param results: a sequence with the following pairs (<location_name>, <index>)
        :type results: Iterable[Tuple[str, int]]
        :return: the best match or an empty string if the results are empty
        :rtype: str
        """
        if results is None:
            return ""
        # min() picks the first minimal element, exactly like taking element 0
        # of a stable sort; `default` covers empty iterables and iterators.
        best = min(
            results,
            key=lambda entity: (entity[1], -len(entity[0].strip())),
            default=None,
        )
        if best is None:
            return ""
        return best[0].strip()

    def __call__(self, *args, **kwargs) -> str:
        """Find the best match in the trie.

        :return: a location name or an empty string if nothing found
        :rtype: str
        """
        return self.find_geo_names_in_utterance(*args, **kwargs)
Exemple #20
0
    def test_suffix_stuff(self):
        """Search texts built from keywords with long repeated-letter runs."""
        tree = KeywordTree()
        for keyword in ('blaaaaaf', 'bluez', 'aaaamen', 'uebergaaat'):
            tree.add(keyword)
        tree.finalize()

        # (text to search, expected (keyword, index) result)
        cases = (
            ('blaaaaamentada', ('aaaamen', 3)),
            ('clueuebergaaameblaaaamenbluez', ('aaaamen', 17)),
        )
        for text, expected in cases:
            self.assertEqual(expected, tree.search(text))
class SymbolExtractor:
    """
    Takes a reddit submission and extracts all mentioned tickers
    Utilizes the aho corasick algorithm known from antivirus software
    """

    # Table read from the ticker file; its `Ticker` column feeds the tree.
    __tickers: DataFrame
    # Keyword tree holding every ticker, built once in the constructor.
    __searchTree: KeywordTree

    def __init__(self, ticker_file: str):
        """
        Create new symbol extractor

        :param ticker_file: Path to a tab-separated file with a Ticker column
            containing all relevant tickers
        """
        self.__tickers = pd.read_csv(ticker_file, sep="\t")
        self.__create_search_tree()

    def extract_symbols(self, submission: Submission) -> List[str]:
        """
        Extracts stock symbols from all text contained in a submission

        :param submission: to be searched
        :return: list of all found tickers without duplicates, ordered by
            first occurrence (title matches before self-text matches)
        """
        symbols: List[str] = self.__extract_symbols_from_title(submission)
        symbols += self.__extract_symbols_from_self_text(submission)
        return self.__remove_duplicates(symbols)

    def __extract_symbols_from_title(self, submission: Submission) -> List[str]:
        """
        Extracts symbols from the title of a submission

        :param submission: to be searched
        :return: list of all found tickers
        """
        return self.find_symbols_in_text(submission.title)

    def __extract_symbols_from_self_text(self, submission: Submission) -> List[str]:
        """
        Extracts symbols from the text of a submission

        :param submission: to be searched
        :return: list of all found tickers, empty when the submission has no
            ``self_text`` attribute
        """
        # NOTE(review): if Submission is praw's model, the body attribute is
        # spelled `selftext`; confirm `self_text` is the intended name.
        if not hasattr(submission, "self_text"):
            return []
        return self.find_symbols_in_text(submission.self_text)

    def find_symbols_in_text(self, text: str) -> List[str]:
        """
        Extracts symbols from a text

        :param text: to be searched
        :return: List of all found tickers (one entry per match, so
            duplicates are possible)
        """
        matches = self.__searchTree.search_all(text)
        # Each match is a (ticker, position) pair; only the ticker is kept.
        return [ticker for ticker, _position in matches]

    def __create_search_tree(self):
        """
        Initializes the search tree with the list of tickers in __tickers
        """
        self.__searchTree = KeywordTree()
        tickers: Series = self.__tickers.Ticker
        for ticker in tickers:
            self.__searchTree.add(ticker)
        self.__searchTree.finalize()

    @staticmethod
    def __remove_duplicates(symbols: List[str]) -> List[str]:
        """
        Drops repeated symbols while keeping the first-seen order

        :param symbols: list that may contain duplicates
        :return: new list with each symbol exactly once
        """
        # dict keys keep insertion order, so the result is deterministic,
        # unlike list(set(...)) whose order changes between runs.
        return list(dict.fromkeys(symbols))
Exemple #22
0
    def test_simple(self):
        """Exercise search() on a two-keyword tree: misses first, then hits."""
        tree = KeywordTree()
        for keyword in ('bla', 'blue'):
            tree.add(keyword)
        tree.finalize()

        # A strict prefix, the empty string and unrelated text all give None.
        for text in ('bl', '', 'zef'):
            self.assertIsNone(tree.search(text))

        # Hits come back as (keyword, start index).
        self.assertEqual(('bla', 0), tree.search('blaaaa'))
        self.assertEqual(('blue', 10), tree.search('red green blue grey'))
def init_ahocorapy():
    """Build and finalize a KeywordTree from ``keyword_list``.

    ``keyword_list`` is defined outside this function; every entry is added
    to a fresh tree in iteration order.

    :return: the finalized KeywordTree
    """
    tree = KeywordTree()
    add_keyword = tree.add
    for entry in keyword_list:
        add_keyword(entry)
    tree.finalize()
    return tree
Exemple #24
0
from ahocorapy.keywordtree import KeywordTree

# Two keyword trees: one for positive terms, one for negative terms.
dic_positive = KeywordTree()
dic_negative = KeywordTree()

# Declinable words (verbs / adjectives).
# Each usable line has exactly two tab-separated fields: a polarity label and
# a term. The encoding is given explicitly so the Japanese text does not
# depend on the platform default (assumes the file is UTF-8 -- TODO confirm).
with open('data/wago.121808.pn', encoding='utf-8') as yougen:
    for line in yougen:
        line_splitted = line.strip().split('\t')
        if len(line_splitted) != 2:
            continue
        polarity_, term_ = line_splitted
        # Only the first two characters of the label decide the polarity.
        polarity = polarity_[:2]
        # Spaces inside the term are removed before it is stored.
        term = term_.replace(' ', '')
        if polarity == 'ポジ':
            dic_positive.add(term)
        elif polarity == 'ネガ':
            dic_negative.add(term)

# Nouns.
# The first two tab-separated fields are the term and its polarity code;
# any further fields are ignored.
with open('data/pn.csv.m3.120408.trim', encoding='utf-8') as meishi:
    for line in meishi:
        line_splitted = line.strip().split('\t')
        # Skip blank or truncated lines instead of failing on unpacking,
        # consistent with the guard used for the first file.
        if len(line_splitted) < 2:
            continue
        term, polarity = line_splitted[:2]
        if polarity == 'p':
            dic_positive.add(term)
        elif polarity == 'n':
            dic_negative.add(term)

# No more terms will be added; finalize both trees.
dic_positive.finalize()
dic_negative.finalize()