class OperateCsv(object):
    """Keywords for reading columns from a csv file."""

    def __init__(self):
        # Registry of live file handlers, shared with the other Operate* keywords.
        self.store = Store()

    def read_csv_columns(self, source_file, *cared_title, **kwargs):
        """Read one or more specific columns from a csv file.

        :param string source_file: path of the csv file
        :param string cared_title: one or more cared column titles
            (at least one is required; the first also selects the handler's
            primary column)
        :param string alias: (in kwargs) optional alias under which the
            handler is registered in the store while reading
        :return: ``[colA_list]`` for a single title, or
            ``[[colA_list], [colB_list]]`` for several

        Example:
        | read_csv_columns | /opt/xxx.csv | Time |     |
        | read_csv_columns | /opt/xxx.csv | Time | SFN |
        """
        csv_obj = CsvHandler(source_file, cared_title[0])
        self.store.add(csv_obj, alias=kwargs.get('alias'))
        try:
            result = csv_obj.get_csv_columns_list(*cared_title)
        finally:
            # Always unregister the handler, even when reading fails,
            # so the store does not accumulate stale entries.
            self.store.remove(alias=kwargs.get('alias'))
        return result
def all(self, data):
    """Enumerate every solution of the grid via exhaustive backtracking.

    Walks the grid cell by cell; `self.attempt` (defined elsewhere)
    presumably fills/advances a cell and signals exhaustion by raising
    an exception whose message is 'There is no solution!' — TODO confirm
    against the attempt implementation.
    """
    solutions = Store()          # collects completed grids (project helper)
    tracker = Tracker()          # backtracking state shared with attempt()
    final = self.size - 1        # index of the last row/column
    pos = (0, 0)
    try:
        while True:
            # Empty cell (0): let attempt() place a candidate there.
            if (data[pos[0]][pos[1]] == 0):
                pos, data = self.attempt(pos, data, tracker)
            # Reached the bottom-right cell: the grid is complete.
            # Record it, then force another attempt to search for
            # further solutions.
            if (pos[0] == final and pos[1] == final):
                solutions.add(data)
                pos, data = self.attempt(pos, data, tracker)
            # Advance to the next cell, wrapping to the next row at
            # the end of the current one.
            if (pos[1] == final):
                pos = (pos[0] + 1, 0)
            else:
                pos = (pos[0], pos[1] + 1)
    except Exception as e:
        # attempt() uses this exact message to signal that the search
        # space is exhausted; anything else is a genuine error.
        if (str(e) != 'There is no solution!'):
            raise e
    return solutions.all()
class OperateTxt(object):
    """Keywords to setup/teardown/modify/add/remove/get a txt file."""

    def __init__(self):
        # Registry of live txt handlers, keyed by optional alias.
        self.store = Store()

    def setup_txt(self, src_file, save_path=None, **kwargs):
        """Set up a txt file for editing.

        :param string src_file: the source txt file
        :param string save_path: the save txt file name; if None the
            source file name is used
        """
        handler = TxtHandler()
        target = save_path if save_path else src_file
        handler.setup(src_file, target)
        self.store.add(handler, alias=kwargs.get('alias'))

    def teardown_txt(self, **kwargs):
        """Tear down the txt file and unregister its handler."""
        alias = kwargs.get('alias')
        self.store.get(alias).teardown()
        self.store.remove(alias=alias)

    def modify_txt_node(self, *args, **kwargs):
        """Modify txt nodes.

        :param string args: such as 0x10042:0x0A691892 0x10043:51015
        """
        self.store.get(kwargs.get('alias')).modify_node(*args)

    def add_txt_node(self, *args, **kwargs):
        """Add txt nodes.

        :param string args: such as '0x10301:1#MAC L2' or '0x10302:2'
        """
        self.store.get(kwargs.get('alias')).add_node(*args)

    def delete_txt_node(self, *args, **kwargs):
        """Delete txt nodes.

        :param string args: such as '0x10049'
        """
        self.store.get(kwargs.get('alias')).delete_node(*args)

    def read_txt_node(self, *args, **kwargs):
        """Return the text value of a txt node.

        :param string args: such as '0x10040'
        """
        return self.store.get(kwargs.get('alias')).get_node_text(*args)
def go_store():
    """Smoke-test driver exercising the Store API end to end.

    Adds/removes a few values, round-trips save/load, then prints the
    results of grep/find and iterates the remaining items.
    """
    store = Store()
    store.remove(None)
    store.add(323, 2, 10, 88, 78, 415, 89, 189)
    store.remove(78)
    store.save()
    store.load()
    # Raw string so '\d' is a regex digit class, not a (py3-invalid)
    # string escape; parenthesized print works on both Python 2 and 3.
    print(store.grep(r'\d*[02468]$'))
    print(store.find(89, -1, 415))
    for item in store.items():
        print(item)
class UnzipFile(object):
    """Keywords for uncompressing archive files."""

    def __init__(self):
        # Registry of live archive handlers, keyed by optional alias.
        self.store = Store()
        self._log = logging.getLogger(__name__)
        self._log.setLevel(logging.DEBUG)

    def unzip_file(self, source_file, to_path=None, **kwargs):
        """Uncompress an archive file.

        Supported formats: '.gz', '.tar', '.tar.bz2', '.tar.gz', '.tgz',
        '.tz2', '.docx', '.egg', '.jar', '.odg', '.odp', '.ods', '.xlsx',
        '.odt', '.pptx', '.zip'.

        :param string source_file: path of the compressed file
        :param string to_path: optional path to save the uncompressed
            files; defaults to a directory named after the archive,
            next to it

        Example:
        | unzip_file | /home/ute/ta_kiss_files/example/snapshot.zip | /home/ute/ta_kiss_files/example |

        The uncompressed files are saved to
        /home/ute/ta_kiss_files/example/snapshot/...
        """
        # Strip a double extension for .tar.gz / .tar.bz2, a single one
        # otherwise, to derive the default output directory name.
        if source_file.endswith(('.tar.gz', '.tar.bz2')):
            source_file_name = os.path.splitext(
                os.path.splitext(source_file)[0])[0]
        else:
            source_file_name = os.path.splitext(source_file)[0]
        if to_path is None:
            to_path = source_file_name
        else:
            to_path = os.path.join(to_path,
                                   os.path.basename(source_file_name))
        # Start from a clean destination so stale files never survive.
        if os.path.isdir(to_path):
            shutil.rmtree(to_path)
        unzip_handler = ArchiveFile()
        self.store.add(unzip_handler, alias=kwargs.get('alias'))
        try:
            unzip_handler.deep_unzip_file(source_file, to_path)
        finally:
            # Unregister even when extraction fails, so the store does
            # not keep a stale handler entry.
            self.store.remove(alias=kwargs.get('alias'))
class Encryptor(object):
    """Encrypts/decrypts messages with Fernet, storing keys in a Store."""

    def __init__(self):
        # Credentials for the backing store come from the environment.
        self.dbUser = os.environ.get('API_USER')
        self.dbPw = os.environ.get('API_PASSWORD')
        self.st = Store(self.dbUser, self.dbPw)

    def encrypt(self, message):
        """Encrypt *message* and persist its key, indexed by hash.

        :param str message: plaintext to encrypt
        :return: the encrypted token (bytes)
        """
        key = Fernet.generate_key()
        f = Fernet(key)
        encoded = message.encode()
        encrypted = f.encrypt(encoded)
        # The hash of the ciphertext is the lookup key for decryption.
        hsh = self.generate_hash(encrypted)
        _strkey = key.decode()
        doc = {
            "hsh": hsh,
            "key": _strkey,
            "destroy": False,
            "date": datetime.datetime.utcnow()
        }
        self.st.add(doc)
        return encrypted

    def decrypt(self, encryptedMessage, destroy=False):
        """Decrypt *encryptedMessage* using its stored key.

        :param bytes encryptedMessage: token previously returned by encrypt()
        :param bool destroy: when True, look the key up destructively
        :return: the decrypted string, or -1 when no key record is found
        """
        hsh = self.generate_hash(encryptedMessage)
        if destroy:
            d = self.st.desFind({'hsh': hsh})
        else:
            d = self.st.find({'hsh': hsh})
        # `is None` rather than `== None`: identity test is the correct
        # (and robust) way to detect a missing record.
        if d is None:
            return -1
        key = d['key'].encode()
        f = Fernet(key)
        _msg = encryptedMessage
        res = f.decrypt(_msg).decode()
        return res

    def destroy(self, encryptedMessage):
        """Destructively remove the key record for *encryptedMessage*.

        :return: True when a record was found and removed
        """
        hsh = self.generate_hash(encryptedMessage)
        fnd = self.st.desFind({'hsh': hsh})
        return fnd != {}

    def generate_hash(self, encrypted):
        """Return the SHA-224 hex digest of *encrypted* (bytes)."""
        return hashlib.sha224(encrypted).hexdigest()
class DDSketch(object):
    """A quantile sketch with relative-error guarantees (DDSketch).

    Values are mapped to logarithmically-sized buckets held in a Store;
    quantiles are answered within a relative error of ``alpha``.
    """

    def __init__(self, alpha=None, bin_limit=None, min_value=None):
        """
        :param float alpha: relative accuracy, in (0, 1); invalid values
            fall back to DEFAULT_ALPHA
        :param int bin_limit: maximum number of bins; invalid values fall
            back to DEFAULT_BIN_LIMIT
        :param float min_value: smallest trackable absolute value;
            invalid values fall back to DEFAULT_MIN_VALUE
        """
        # Make sure the parameters are valid, falling back to defaults.
        if alpha is None or (alpha <= 0 or alpha >= 1):
            alpha = DEFAULT_ALPHA
        if bin_limit is None or bin_limit < 0:
            bin_limit = DEFAULT_BIN_LIMIT
        # BUG FIX: the original tested `min_value < 0` directly, which
        # raises TypeError on Python 3 when min_value is None (the
        # default).  The None check first preserves the intended
        # "fall back to default" behavior.
        if min_value is None or min_value < 0:
            min_value = DEFAULT_MIN_VALUE

        self.gamma = 1 + 2 * alpha / (1 - alpha)
        # log(gamma) computed via log1p for numerical accuracy at small alpha.
        self.gamma_ln = math.log1p(2 * alpha / (1 - alpha))
        self.min_value = min_value
        # Shift applied to keys so values just above min_value map near 0.
        self.offset = -int(math.ceil(math.log(min_value) / self.gamma_ln)) + 1

        self.store = Store(bin_limit)
        # Exact summary statistics tracked alongside the bucketed store.
        self._min = float('+inf')
        self._max = float('-inf')
        self._count = 0
        self._sum = 0

    def __repr__(self):
        return "store: {{{}}}, count: {}, sum: {}, min: {}, max: {}".format(
            self.store, self._count, self._sum, self._min, self._max)

    @property
    def name(self):
        return 'DDSketch'

    @property
    def num_values(self):
        return self._count

    @property
    def avg(self):
        return float(self._sum) / self._count

    @property
    def sum(self):
        return self._sum

    def get_key(self, val):
        """Map *val* to its bucket key; values in [-min_value, min_value] map to 0."""
        if val < -self.min_value:
            return -int(math.ceil(
                math.log(-val) / self.gamma_ln)) - self.offset
        elif val > self.min_value:
            return int(math.ceil(math.log(val) / self.gamma_ln)) + self.offset
        else:
            return 0

    def add(self, val):
        """Add a value to the sketch."""
        key = self.get_key(val)
        self.store.add(key)

        # Keep track of summary stats
        self._count += 1
        self._sum += val
        if val < self._min:
            self._min = val
        if val > self._max:
            self._max = val

    def quantile(self, q):
        """Return the value at quantile *q* in [0, 1]; NaN when empty/invalid."""
        if q < 0 or q > 1 or self._count == 0:
            return np.NaN

        if q == 0:
            return self._min
        if q == 1:
            return self._max

        # Rank is 1-based within the sorted stream of added values.
        rank = int(q * (self._count - 1) + 1)
        key = self.store.key_at_rank(rank)
        # Invert the key->bucket mapping; negative keys are mirrored.
        if key < 0:
            key += self.offset
            quantile = -2 * pow(self.gamma, -key) / (1 + self.gamma)
        elif key > 0:
            key -= self.offset
            quantile = 2 * pow(self.gamma, key) / (1 + self.gamma)
        else:
            quantile = 0
        # Never report below the exact observed minimum.
        return max(quantile, self._min)

    def merge(self, sketch):
        """Merge another DDSketch into this one (parameters must match)."""
        if not self.mergeable(sketch):
            raise UnequalSketchParametersException(
                "Cannot merge two DDSketches with different parameters")

        if sketch._count == 0:
            return

        if self._count == 0:
            self.copy(sketch)
            return

        # Merge the stores
        self.store.merge(sketch.store)

        # Merge summary stats
        self._count += sketch._count
        self._sum += sketch._sum
        if sketch._min < self._min:
            self._min = sketch._min
        if sketch._max > self._max:
            self._max = sketch._max

    def mergeable(self, other):
        """Two sketches can be merged only if their gamma and min_values are equal."""
        return self.gamma == other.gamma and self.min_value == other.min_value

    def copy(self, sketch):
        """Overwrite this sketch's state with a copy of *sketch*'s."""
        self.store.copy(sketch.store)
        self._min = sketch._min
        self._max = sketch._max
        self._count = sketch._count
        self._sum = sketch._sum
class Encryptor(object):
    """ The Encryptor object, handles encryption and decryption """

    def __init__(self):
        # process mongodb environs; fall back to 0 when unavailable
        # (narrowed from a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit).
        try:
            self.dbUser = os.environ.get("API_USER")
            self.dbPw = os.environ.get("API_PASSWORD")
        except Exception:
            self.dbUser = 0
            self.dbPw = 0
        self.dbUrl = os.environ.get("API_URL")
        # Add store object to interact with the database
        self.st = Store(self.dbUrl, self.dbUser, self.dbPw)
        # Emoji converter
        self.conv = EmojiConverter("emojList.txt")

    def encrypt(self, message, emoji=False) -> str:
        """Compress, encrypt and persist *message*; return the token.

        :param str message: plaintext to encrypt
        :param bool emoji: when True, return the token as emoji text
        """
        # Use fernet to generate key
        key = Fernet.generate_key()
        f = Fernet(key)
        # Compress the intended message via zlib
        encoded = zlib.compress(message.encode())
        # The encryption
        encrypted = f.encrypt(encoded)
        # Generate the hash of the encrypted string
        hsh = generate_hash(encrypted)
        # decoded key to store in mongodb
        _strkey = key.decode()
        # Store data in mongodb
        doc = {
            "hsh": hsh,
            "key": _strkey,
            "destroy": False,
            "compress": "zlib",
            "date": datetime.datetime.utcnow(),
        }
        self.st.add(doc)
        # Return the encrypted message
        encrypted = encrypted.decode()
        if emoji:
            encrypted = self.conv.sentence_to_emoji(encrypted)
        return encrypted

    def decrypt(self, encryptedMessage, destroy=False) -> str:
        """Decrypt a token produced by encrypt().

        :param bytes encryptedMessage: token (possibly emoji-encoded)
        :param bool destroy: when True, look the key up destructively
        :return: the decrypted string, or -1 when no key record is found
        """
        # hashing the message
        em = encryptedMessage.decode()
        # Emoji-encoded tokens are converted back before hashing.
        if self.conv.is_emoji(em[0]):
            encryptedMessage = self.conv.emoji_to_sentence(em).encode()
        hsh = generate_hash(encryptedMessage)
        # Find the dataset in mongodb
        if destroy:
            d = self.st.desFind({"hsh": hsh})
        else:
            d = self.st.find({"hsh": hsh})
        # `is None` rather than `== None`: identity test is the correct
        # way to detect a missing record.
        if d is None:
            return -1
        # decrypt with key
        key = d["key"].encode()
        f = Fernet(key)
        _msg = encryptedMessage
        res = f.decrypt(_msg)
        res = zlib.decompress(res).decode()
        return res

    def destroy(self, encryptedMessage) -> bool:
        """Destructively remove the key record; True when one existed."""
        hsh = generate_hash(encryptedMessage)
        fnd = self.st.desFind({"hsh": hsh})
        return fnd != {}
class TestStore(unittest2.TestCase):
    """Unit tests for the triple Store: CRUD, SPARQL and datatypes."""

    def setUp(self):
        self.store = Store(name="scratch")
        self.ns = Namespace('http://example.com/#')

    def tearDown(self):
        self.store.close()

    def testSize(self):
        """A fresh repository is empty."""
        self.assertEqual(len(self.store), 0)

    def testAdd(self):
        """Adding one triple grows the store to size 1."""
        subject = self.ns['bob']
        predicate = self.ns['name']
        obj = Literal('Bob Bilbins')
        self.store.add((subject, predicate, obj))
        self.assertEqual(len(self.store), 1)

    def testRemove(self):
        """Removing the only triple empties the store again."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.assertEqual(len(self.store), 1)
        self.store.remove(triple)
        self.assertEqual(len(self.store), 0)

    def testTriples(self):
        """Search by triple pattern returns the stored triple."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        for found in self.store.triples((self.ns['alice'], None, None)):
            for got, expected in zip(found, triple):
                self.assertEqual(got, expected)

    def testSimpleSparql(self):
        """A catch-all SPARQL query returns the stored triple."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        for found in self.store.query("SELECT ?s ?p ?o WHERE {?s ?p ?o .}"):
            for got, expected in zip(found, triple):
                self.assertEqual(got, expected)

    def testNamespacedSparql(self):
        """Namespace prefixes (initNs) select the right subject."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.store.add((self.ns['bob'], self.ns['name'], Literal('Bob')))
        for found in self.store.query("SELECT ?p ?o WHERE { ex:alice ?p ?o .}",
                                      initNs={'ex': self.ns}):
            # Result rows hold (?p, ?o); compare against triple[1:].
            for got, expected in zip(found, triple[1:]):
                self.assertEqual(got, expected)

    def testBindedSparql(self):
        """Pre-bound variables (initBindings) select the right subject."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.store.add((self.ns['bob'], self.ns['name'], Literal('Bob')))
        for found in self.store.query("SELECT ?p ?o WHERE { ?s ?p ?o .}",
                                      initBindings={'s': self.ns['alice']}):
            for got, expected in zip(found, triple[1:]):
                self.assertEqual(got, expected)

    def testDataTypes(self):
        """Typed date literals compare chronologically."""
        birth = Literal('2006-01-03', datatype=_XSD_NS.date)
        comp = Literal('2006-01-01', datatype=_XSD_NS.date)
        triple = (self.ns['alice'], self.ns['birthdate'], birth)
        self.store.add(triple)
        for s, p, o in self.store.query("SELECT ?s ?p ?o WHERE {?s ?p ?o .}"):
            self.assertLess(comp, birth)
class BaseScraper(object):
    """Basic scraper framework for grabbing press releases.

    Derived scrapers generally need to implement:
      name          - string name of the scraper
      doc_type      - numeric document type for uploaded press releases
      find_latest() - grab a list of the latest press releases
                      (usually from an rss feed)
      extract()     - parse html data to pull out the various text and
                      metadata of the press release
    """

    def __init__(self):
        # derived classes need to set these
        assert self.name is not None
        assert self.doc_type is not None
        self.parser = OptionParser(usage="%prog: [options]")
        self.parser.add_option('-v', '--verbose', action='store_true')
        self.parser.add_option('-d', '--debug', action='store_true')
        self.parser.add_option('-t', '--test', action='store_true',
            help="test only - don't send any documents to server")
        self.parser.add_option('-c', '--cache', action='store_true',
            help="cache all http transfers in .cache dir (for repeated runs during test)")
        self.parser.add_option('-u', '--url', nargs=1,
            help="process just the given URL")
        self.parser.add_option('-i', '--ini-file', default="churnalism.cfg", nargs=1,
            help="filename for connection settings [default: %default]")

    def main(self):
        """Set everything up, then invoke go()."""
        (options, args) = self.parser.parse_args()

        log_level = logging.ERROR
        if options.debug:
            log_level = logging.DEBUG
        elif options.verbose:
            log_level = logging.INFO
        logging.basicConfig(level=log_level)  # , format='%(message)s')

        if options.test:
            self.store = DummyStore(self.name, self.doc_type)
        else:
            # load in config file for real run; use a context manager so
            # the ini file handle is closed (the original passed an
            # anonymous open() to readfp and leaked it)
            config = ConfigParser.ConfigParser()
            with open(options.ini_file) as ini_fp:
                config.readfp(ini_fp)
            auth_user = config.get("DEFAULT", 'user')
            auth_pass = config.get("DEFAULT", 'pass')
            server = config.get("DEFAULT", 'server')
            self.store = Store(self.name, self.doc_type,
                auth_user=auth_user, auth_pass=auth_pass, server=server)

        if options.cache:
            logging.info("using .cache")
            opener = urllib2.build_opener(CacheHandler(".cache"))
            urllib2.install_opener(opener)

        self.go(options)

    def go(self, options):
        """Perform the actual scraping.

        Default implementation just calls find_latest and processes the
        discovered press releases.  Derived classes will likely want to
        handle custom options for fetching historical data — see
        prnewswire for an example.
        """
        if options.url:
            urls = [options.url, ]
        else:
            urls = self.find_latest()
        self.process_batch(urls)

    def process_batch(self, urls):
        """Run through a list of urls, fetching, extracting and storing each in turn."""
        # cull out ones we've got
        n_before = len(urls)
        urls = [url for url in urls if not self.store.already_got(url)]
        logging.info("processing %d urls (%d are new)", n_before, len(urls))

        err_cnt = 0
        try:
            for url in urls:
                try:
                    logging.debug("fetch %s", url)
                    response = urllib2.urlopen(url)
                    html = response.read()
                    # TODO: maybe just skip ones which redirect to other domains?
                    if response.geturl() != url:
                        logging.warning("Redirect detected %s => %s", url, response.geturl())
                    press_release = self.extract(html, url)
                    # encode text fields
                    # TODO: use isinstance(...,unicode) instead
                    for f in ('url', 'title', 'source', 'text', 'location', 'language', 'topics'):
                        if f in press_release:
                            press_release[f] = press_release[f].encode('utf-8')
                    self.store.add(press_release)
                except Exception as e:
                    # keep going on per-url failures, but record them
                    logging.error("failed on %s: %s %s", url, e.__class__, e)
                    err_cnt += 1
        finally:
            # always flush whatever was collected, even on fatal errors
            self.store.save()

    def find_latest(self):
        """Obtain the list of "latest" press releases, whatever that means for a given target."""
        return []

    def extract(self, html, url):
        """Extract a single downloaded press release."""
        assert False  # need to implement in derived class!
class OperateXls(object):
    """Keywords for reading and writing excel files."""

    def __init__(self):
        # Registry of live xls handlers, keyed by optional alias.
        self.store = Store()
        self._log = logging.getLogger(__name__)
        self._log.setLevel(logging.DEBUG)

    def read_excel_cell(self, source_file, sheet_name, x_cell, y_cell,
                        **kwargs):
        """Read a specific cell from an excel sheet.

        :param string source_file: absolute path of the excel file
        :param string sheet_name: sheet name in the excel file
        :param int x_cell: row number (zero-based)
        :param int y_cell: column number (zero-based)
        :param string alias: (in kwargs) optional store alias
        :return: the cell content

        Example:
        | read_excel_cell | /opt/xxx.xls | Sheet1 | 1 | 2 |
        reads row 1, column 2 (i.e. cell C2 in excel terms).
        """
        xls = XlsHandler(source_file, sheet_name)
        self.store.add(xls, alias=kwargs.get('alias'))
        try:
            xls.open_excel()
            content = xls.read_cell(x_cell, y_cell)
        finally:
            # Always unregister the handler, even on read failure.
            self.store.remove(alias=kwargs.get('alias'))
        return content

    def modify_excel_cell(self, source_file, sheet_name, x_cell, y_cell,
                          in_value, **kwargs):
        """Write or modify a specific cell of an excel sheet.

        Make sure the excel file is closed when you use this keyword.
        Creates the file and/or the sheet when they do not exist yet.

        :param string source_file: absolute path of the excel file
        :param string sheet_name: sheet name in the excel file
        :param int x_cell: row number (zero-based)
        :param int y_cell: column number (zero-based)
        :param in_value: new value to write
        :param string alias: (in kwargs) optional store alias

        Example:
        | modify_excel_cell | /opt/xxx.xls | Sheet1 | 1 | 2 | 333 |
        """
        xls = XlsHandler(source_file, sheet_name)
        self.store.add(xls, alias=kwargs.get('alias'))
        try:
            # file operation, if exist, copy and modify, if not create one
            if os.path.isfile(source_file):
                xls.open_excel()
            else:
                xls.create_excel()
            # sheet operation, if exist, get index, if not create one
            xls.get_sheet_index(sheet_name)
            # cell operation
            xls.write_cell(x_cell, y_cell, in_value)
            # modification save
            xls.save()
        finally:
            # Always unregister the handler, even when any step fails.
            self.store.remove(alias=kwargs.get('alias'))
class OperateBin(object):
    """Keywords for reading and modifying tagged values in a bin file."""

    def __init__(self):
        # Registry of live bin handlers, keyed by optional alias.
        self.store = Store()
        # Placeholder until a file is opened by one of the keywords.
        self.file_object = ''
        self.bin_handler = BinHandler(self.file_object)

    def read_bin_file(self, bin_file, tag_name, **kwargs):
        """Read a bin file's text by tag name.

        ``<SwVersion>5912129</SwVersion>`` has tag_name ``SwVersion``
        and text ``5912129``.

        :param string bin_file: path of the bin file
        :param string tag_name: tag name in the bin file (<xxx>)
        :param string alias: (in kwargs) optional store alias
        :return: the tag's text, or 'NOT FOUND' when the tag is absent

        Example:
        | ${value} | read_bin_file | /opt/xxx.bin | SwVersion |
        """
        with open(bin_file, 'r+b') as self.file_object:
            self.bin_handler = BinHandler(self.file_object)
            self.store.add(self.bin_handler, alias=kwargs.get('alias'))
            try:
                value = self.bin_handler.find_tag_position(tag_name)
            except TAFileException:
                # Missing tag is reported as a sentinel string, keeping
                # the keyword non-raising for robot usage.
                return 'NOT FOUND'
            finally:
                # Unregister on every exit path; the original leaked the
                # store entry when the tag was not found.
                self.store.remove(alias=kwargs.get('alias'))
            return value[0][1].decode('utf-8')

    def modify_bin_file(self, bin_file, tag_name, new_value, **kwargs):
        """Modify a bin file's text by tag name.

        The new value must have the same length as the old one.
        ``<SwVersion>5912129</SwVersion>`` has tag_name ``SwVersion``
        and text ``5912129``.

        :param string bin_file: path of the bin file
        :param string tag_name: tag name in the bin file (<xxx>)
        :param string new_value: replacement text (same length as old)
        :param string alias: (in kwargs) optional store alias
        :return: None on success, 'Fail to modify' on failure

        Example:
        | modify_bin_file | /opt/xxx.bin | SwVersion | 5912128 |
        """
        with open(bin_file, 'r+b') as self.file_object:
            self.bin_handler = BinHandler(self.file_object)
            self.store.add(self.bin_handler, alias=kwargs.get('alias'))
            try:
                self.bin_handler.replace_text_in_bin_file(tag_name, new_value)
            except TAFileException:
                return 'Fail to modify'
            finally:
                # Unregister on every exit path; the original leaked the
                # store entry when the replacement failed.
                self.store.remove(alias=kwargs.get('alias'))
class OperateXml(object):
    """Keywords to setup/teardown/modify/add/remove/get an xml file."""

    def __init__(self):
        # Registry of live xml handlers, keyed by optional alias.
        self.store = Store()

    def setup_xml(self, src_file, save_path=None, **kwargs):
        """Set up an xml file for editing.

        :param string src_file: the source xml file
        :param string save_path: the save xml file name; if None the
            source file name is used
        """
        handler = XmlHandler()
        target = save_path if save_path else src_file
        handler.setup(src_file, target)
        self.store.add(handler, alias=kwargs.get('alias'))

    def teardown_xml(self, **kwargs):
        """Tear down the xml file and unregister its handler."""
        alias = kwargs.get('alias')
        self.store.get(alias).teardown()
        self.store.remove(alias=alias)

    def modify_xml_text(self, *args, **kwargs):
        """Modify xml node text.

        :param string args: such as
            .//managedObject[@class="LNBTS"]/p[@name="actDLCAggr"]:false
        """
        self.store.get(kwargs.get('alias')).modify_node_text(*args)

    def modify_xml_attribute(self, *args, **kwargs):
        """Modify xml node attributes.

        :param string args: such as
            .//managedObject[@class\="NOKLTE:LNCEL_TDD"]:@version\=TL17_1610_01_1610_02
        """
        self.store.get(kwargs.get('alias')).modify_node_attribute(*args)

    def add_xml_node(self, *args, **kwargs):
        """Add xml nodes.

        :param string args: such as
            .//managedObject[@class="LNBTS"]/list[@name="qciTab6"]/item:<p name="nbrDl">10240</p>
        """
        self.store.get(kwargs.get('alias')).add_node(*args)

    def delete_xml_node(self, *args, **kwargs):
        """Delete xml nodes.

        :param string args: such as
            .//managedObject[@class="LNCEL"][@distName="0"]/list/item/p[@name="dFpucchF1b"]
        """
        self.store.get(kwargs.get('alias')).delete_node(*args)

    def read_xml_text(self, *args, **kwargs):
        """Return an xml node's text value.

        :param string args: such as
            .//managedObject[@class="LNCEL"][@distName="0"]/list/item/p[@name="dFpucchF1b"]
        """
        return self.store.get(kwargs.get('alias')).get_node_text(*args)

    def read_xml_attribute(self, *args, **kwargs):
        """Return an xml node's attribute value.

        :param string args: such as
            .//managedObject[@class\="NOKLTE:LNCEL_TDD"]:@version
        """
        return self.store.get(kwargs.get('alias')).get_node_attribute(*args)
class FtpUploadDownload(object):
    """Keywords to upload and download files over ftp/sftp."""

    def __init__(self):
        self.ftp_handler = FtpHandler()
        self.sftp_handler = SFtpHandler()
        # Registry of live handlers, keyed by optional alias.
        self.store = Store()

    def ftp_download(self, host, port, usr, pwd, remote, local=None,
                     **kwargs):
        """Download a file over ftp.

        :param host: such as '192.168.255.1'
        :param port: such as 21
        :param usr: username such as 'admin'
        :param pwd: password such as 'admin'
        :param remote: file in host such as '/tmp/rat_psconfig.xml'
        :param local: file path to save in local such as
            'example/config1/rat1.xml', 'example/config1/', 'rat1.xml', None
        :return: the downloaded file path
        """
        alias = kwargs.get('alias')
        self.store.add(self.ftp_handler, alias=alias)
        try:
            handler = self.store.get(alias)
            handler.connect_ftp(host, port, usr, pwd)
            try:
                dnload_file = handler.ftp_download(remote, local)
            finally:
                # Close the connection even when the transfer fails.
                handler.close_ftp()
        finally:
            # Unregister the handler on every exit path.
            self.store.remove(alias=alias)
        return dnload_file

    def ftp_upload(self, host, port, usr, pwd, local, remote=None,
                   **kwargs):
        """Upload a file over ftp.

        :param host: such as '192.168.255.1'
        :param port: such as 21
        :param usr: username such as 'admin'
        :param pwd: password such as 'admin'
        :param local: file in local such as '/tmp/rat_psconfig.xml'
        :param remote: file path to save in host such as
            '/tmp/tmp1/rat_psconfig_test1.xml', '/tmp/tmp1/',
            'rat_psconfig_test1.xml', None
        """
        alias = kwargs.get('alias')
        self.store.add(self.ftp_handler, alias=alias)
        try:
            handler = self.store.get(alias)
            handler.connect_ftp(host, port, usr, pwd)
            try:
                handler.ftp_upload(local, remote)
            finally:
                # Close the connection even when the transfer fails.
                handler.close_ftp()
        finally:
            self.store.remove(alias=alias)

    def sftp_download(self, host, port, usr, pwd, remote, local=None,
                      **kwargs):
        """Download a file or directory (recursively) over sftp.

        :param host: such as '192.168.255.1'
        :param port: such as 22
        :param usr: username such as 'admin'
        :param pwd: password such as 'admin'
        :param remote: file in host such as '/tmp/rat_psconfig.xml'
            or folder '/tmp/' (NOT '/tmp')
        :param local: file path to save in local such as
            'example/config1/rat1.xml', 'example/config1/', 'rat1.xml', None
        :return: the downloaded file path
        """
        alias = kwargs.get('alias')
        self.store.add(self.sftp_handler, alias=alias)
        try:
            dl_file = self.store.get(alias).sftp_download(
                host, port, usr, pwd, remote, local)
        finally:
            self.store.remove(alias=alias)
        return dl_file

    def download_latest_file(self, host, port, usr, pwd, remote,
                             local=None, **kwargs):
        """Download the newest matching file over sftp.

        :param host: such as '192.168.255.1'
        :param port: such as 22
        :param usr: username such as 'admin'
        :param pwd: password such as 'admin'
        :param remote: path in host such as '/tmp/'
        :param local: file path to save in local such as
            'example/config1/rat1.xml', 'example/config1/', 'rat1.xml', None
        :param filter in **kwargs: such as: filter = PM.*.xml
        """
        alias = kwargs.get('alias')
        self.store.add(self.sftp_handler, alias=alias)
        try:
            self.store.get(alias).sftp_download_latest_file(
                host, port, usr, pwd, remote, local,
                filter=kwargs.get('filter'))
        finally:
            self.store.remove(alias=alias)

    def sftp_upload(self, host, port, usr, pwd, local, remote=None,
                    **kwargs):
        """Upload a file over sftp.

        :param host: such as '192.168.255.1'
        :param port: such as 22
        :param usr: username such as 'admin'
        :param pwd: password such as 'admin'
        :param local: file in local such as '/tmp/rat_psconfig.xml'
        :param remote: file path to save in host such as
            '/tmp/tmp1/rat_psconfig_test1.xml', '/tmp/tmp1/',
            'rat_psconfig_test1.xml', None
        """
        alias = kwargs.get('alias')
        self.store.add(self.sftp_handler, alias=alias)
        try:
            self.store.get(alias).sftp_upload(host, port, usr, pwd,
                                              local, remote)
        finally:
            self.store.remove(alias=alias)
class BaseScraper(object):
    """Basic scraper framework for grabbing press releases.

    Derived scrapers generally need to implement:
      name          - string name of the scraper
      doc_type      - numeric document type for uploaded press releases
      find_latest() - grab a list of the latest press releases
                      (usually from an rss feed)
      extract()     - parse html data to pull out the various text and
                      metadata of the press release
    """

    # Redirect policy knobs, overridable per derived scraper.
    require_same_domain = False
    disallow_redirects = False
    # Extra HTTP headers sent with every fetch.
    headers = {}

    def __init__(self):
        # derived classes need to set these
        assert self.name is not None
        assert self.doc_type is not None
        self.parser = OptionParser(usage="%prog: [options]")
        self.parser.add_option('-v', '--verbose', action='store_true')
        self.parser.add_option('-d', '--debug', action='store_true')
        self.parser.add_option(
            '-t', '--test', action='store_true',
            help="test only - don't send any documents to server")
        self.parser.add_option(
            '-c', '--cache', action='store_true',
            help=
            "cache all http transfers in .cache dir (for repeated runs during test)"
        )
        self.parser.add_option('-u', '--url', nargs=1,
                               help="process just the given URL")
        self.parser.add_option(
            '-i', '--ini-file', default="churnalism.cfg", nargs=1,
            help="filename for connection settings [default: %default]")

    def main(self):
        """Set everything up, then invoke go()."""
        (options, args) = self.parser.parse_args()

        log_level = logging.ERROR
        if options.debug:
            log_level = logging.DEBUG
        elif options.verbose:
            log_level = logging.INFO
        logging.basicConfig(level=log_level)  # , format='%(message)s')

        if options.test:
            self.store = DummyStore(self.name, self.doc_type)
        else:
            # load in config file for real run; use a context manager so
            # the ini file handle is closed (the original passed an
            # anonymous open() to readfp and leaked it)
            config = ConfigParser.ConfigParser()
            with open(options.ini_file) as ini_fp:
                config.readfp(ini_fp)
            auth_user = config.get("DEFAULT", 'user')
            auth_pass = config.get("DEFAULT", 'pass')
            server = config.get("DEFAULT", 'server')
            self.store = Store(self.name,
                               self.doc_type,
                               auth_user=auth_user,
                               auth_pass=auth_pass,
                               server=server)

        self.go(options)

    def go(self, options):
        """Perform the actual scraping.

        Default implementation just calls find_latest and processes the
        discovered press releases.  Derived classes will likely want to
        handle custom options for fetching historical data — see
        prnewswire for an example.
        """
        if options.url:
            urls = [
                options.url,
            ]
        else:
            urls = self.find_latest()
        self.process_batch(urls)

    def process_batch(self, urls, extra_headers=None):
        """Run through a list of urls, fetching, extracting and storing each in turn."""
        # cull out ones we've got
        n_before = len(urls)
        urls = [url for url in urls if not self.store.already_got(url)]
        logging.info("processing %d urls (%d are new)", n_before, len(urls))

        err_cnt = 0
        try:
            for url in urls:
                try:
                    logging.debug("fetch %s", url)

                    headers = {}
                    headers.update(self.headers)
                    if extra_headers:
                        headers.update(extra_headers)

                    response = requests.get(url, headers=headers)

                    # TODO: maybe just skip ones which redirect to other domains?
                    if response.url != url:
                        if self.disallow_redirects:
                            logging.warning(
                                "Skipping %s because it redirected to %s",
                                url, response.url)
                            continue
                        elif self.require_same_domain:
                            orig_location = urlparse.urlparse(url)
                            new_location = urlparse.urlparse(response.url)
                            if orig_location.netloc != new_location.netloc:
                                logging.warning(
                                    "Skipping %s because it redirected to another domain: %s",
                                    url, response.url)
                                continue

                    press_release = self.extract(response.text, url)
                    # encode text fields
                    # TODO: use isinstance(...,unicode) instead
                    for f in ('url', 'title', 'source', 'text', 'location',
                              'language', 'topics'):
                        if f in press_release:
                            press_release[f] = press_release[f].encode('utf-8')
                    self.store.add(press_release)
                except Exception as e:
                    logging.error("failed on %s: %s %s", url, e.__class__, e)
                    # print_exc already writes the traceback to stderr;
                    # the original `print traceback.print_exc()` also
                    # printed its None return value.
                    traceback.print_exc()
                    err_cnt += 1
        finally:
            # always flush whatever was collected, even on fatal errors
            self.store.save()

    def find_latest(self):
        """Obtain the list of "latest" press releases, whatever that means for a given target."""
        return []

    def extract(self, html, url):
        """Extract a single downloaded press release."""
        assert False  # need to implement in derived class!
class TestStore(unittest2.TestCase):
    """Exercises the triple Store wrapper: size, add/remove, triple search,
    plain / namespaced / pre-bound SPARQL queries, and typed literals."""

    def setUp(self):
        self.store = Store(name="scratch")
        self.ns = Namespace('http://example.com/#')

    def tearDown(self):
        self.store.close()

    def testSize(self):
        """ Tests the size of the repository """
        self.assertEqual(len(self.store), 0)

    def testAdd(self):
        """Adding one triple grows the store to size 1."""
        bob = self.ns['bob']
        name = self.ns['name']
        value = Literal('Bob Bilbins')
        self.store.add((bob, name, value))
        self.assertEqual(len(self.store), 1)

    def testRemove(self):
        """Removing the only triple empties the store again."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.assertEqual(len(self.store), 1)
        self.store.remove(triple)
        self.assertEqual(len(self.store), 0)

    def testTriples(self):
        """ Tests the search by triple. """
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        for tri in self.store.triples((self.ns['alice'], None, None)):
            for i in range(3):
                self.assertEqual(tri[i], triple[i])

    def testSimpleSparql(self):
        """An unconstrained SELECT returns the single stored triple."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        for tri in self.store.query("SELECT ?s ?p ?o WHERE {?s ?p ?o .}"):
            for i in range(3):
                self.assertEqual(tri[i], triple[i])

    def testNamespacedSparql(self):
        """A prefixed subject (ex:alice via initNs) selects only alice's triple."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.store.add((self.ns['bob'], self.ns['name'], Literal('Bob')))
        for tri in self.store.query("SELECT ?p ?o WHERE { ex:alice ?p ?o .}",
                                    initNs={'ex': self.ns}):
            # result rows carry (?p, ?o); compare against triple[1:]
            for i in range(1, 3):
                self.assertEqual(tri[i - 1], triple[i])

    def testBindedSparql(self):
        """Pre-binding ?s via initBindings selects only alice's triple."""
        triple = (self.ns['alice'], self.ns['name'], Literal('Alice'))
        self.store.add(triple)
        self.store.add((self.ns['bob'], self.ns['name'], Literal('Bob')))
        for tri in self.store.query("SELECT ?p ?o WHERE { ?s ?p ?o .}",
                                    initBindings={'s': self.ns['alice']}):
            # result rows carry (?p, ?o); compare against triple[1:]
            for i in range(1, 3):
                self.assertEqual(tri[i - 1], triple[i])

    def testDataTypes(self):
        """Typed literals (xsd:date) survive a round-trip and compare by value."""
        birth = Literal('2006-01-03', datatype=_XSD_NS.date)
        comp = Literal('2006-01-01', datatype=_XSD_NS.date)
        triple = (self.ns['alice'], self.ns['birthdate'], birth)
        self.store.add(triple)
        for s, p, o in self.store.query("SELECT ?s ?p ?o WHERE {?s ?p ?o .}"):
            # assert against the literal that came back from the store: the
            # original code asserted comp < birth, two local values that never
            # touched the query results, so the round-trip was untested
            self.assertEqual(o, birth)
            self.assertLess(comp, o)
class KeyboardHandler:
    """Global keyboard hook that watches for two-letter command sequences:
    'ee' stops the handler, 'aa' enters left-click mouse mode. Returning
    False from the pyHook callback swallows the keystroke."""

    def __init__(self):
        self.shouldRun = True
        self.store = Store()   # remembers recent keypresses (lastPress/clear)
        self.state = State()   # current mode: starting / positioning

    def OnKeyboardEvent(self, event):
        """pyHook KeyDown callback.

        :param event: pyHook keyboard event; only event.Ascii is used
        :return: False to capture (swallow) the keystroke, True to pass it on
        """
        print('Ascii:', event.Ascii, chr(event.Ascii))
        lastPress = self.store.lastPress()
        # bind once and reuse; the original recomputed chr(event.Ascii)
        # in every comparison below despite having this variable
        keyPressed = chr(event.Ascii)
        self.store.add(keyPressed)
        if self.state.isStarting():
            if keyPressed == 'e' and lastPress != 'e':
                # Prepare to exit, capture 'e'
                return False
            if keyPressed != 'e' and lastPress == 'e':
                # Exit sequence aborted, release 'e'
                SendKeypress(0x12)
            if keyPressed == 'e' and lastPress == 'e':
                # Run 'ee' exit command
                self.store.clear()
                self.shouldRun = False
                return False
            if keyPressed == 'a' and lastPress != 'a':
                # Prepare to enter left click mode, capture 'a'
                return False
            if keyPressed != 'a' and lastPress == 'a':
                # Left click sequence aborted, release 'a'
                SendKeypress(0x1E)
            if keyPressed == 'a' and lastPress == 'a':
                # Enter 'aa' left click mode
                self.store.clear()
                self.state.enterMouseMode('leftClick')
                return False
        if self.state.isPositioning():
            # Placeholder
            if keyPressed == 'a':
                self.store.clear()
                self.state.clear()
                return False
        if event.Ascii == 27:
            # Reset if 'esc' key entered
            self.store.clear()
            self.state.clear()
        return True

    def start(self):
        """Install the keyboard hook and pump Windows messages until
        shouldRun is cleared (by the 'ee' sequence)."""
        hookManager = pyHook.HookManager()
        hookManager.KeyDown = self.OnKeyboardEvent
        hookManager.HookKeyboard()
        while self.shouldRun:
            pythoncom.PumpWaitingMessages()