def test_find_most_similair_conformation(self):
    molref = """
 OpenBabel12071321472D

  2  1  0  0  0  0  0  0  0  0999 V2000
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.5000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
M  END"""
    mol = """
 OpenBabel12071321472D

  2  1  0  0  0  0  0  0  0  0999 V2000
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.7000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
M  END
"""
    can = mol_2_can(mol)
    dbi = DBInterface(self.ses)
    db_ethane = Molecule(SMILES='CC')
    dbi.add_molecule(db_ethane)
    db_mol = dbi.get_molecule(can)
    db_mol.add_conformation(Conformation(Mol=molref))
    self.ses.add(db_mol)
    db_conf = db_mol.find_most_similair_conf(mol)[0]
    assert str(db_conf.Mol) == molref
def setUp(self):
    create_database("tests.db", "y")
    self.dbi = DBInterface("tests.db")
    self.maillists = [{
        "id": 1,
        "name": "Hack Bulgaria"
    }, {
        "id": 2,
        "name": "HackFMI"
    }]
    self.subscribers = [{
        "subscriber_id": 1,
        "name": "RadoRado",
        "email": "*****@*****.**"
    }, {
        "subscriber_id": 2,
        "name": "IvoIvo",
        "email": "*****@*****.**"
    }]
    self.maillists_to_subscribers = [{
        "maillist_id": 1,
        "subscriber_id": 1
    }, {
        "maillist_id": 1,
        "subscriber_id": 2
    }, {
        "maillist_id": 2,
        "subscriber_id": 2
    }, {
        "maillist_id": 2,
        "subscriber_id": 3
    }, {
        "maillist_id": 2,
        "subscriber_id": 4
    }]
def restore_products(self, data, seqnum):
    assert self.amazon_world is not None
    keys = [
        'Warehouse ID', 'Product ID', 'Product Name', 'Amount To Purchase',
        'Local Storage ID'
    ]
    for key in keys:
        if key not in data:
            raise UserKeyException(
                '"{}" does not exist in restore_products data'.format(key))
    warehouse_ids = data['Warehouse ID']
    product_ids = data['Product ID']
    descriptions = data['Product Name']
    counts = data['Amount To Purchase']
    local_storage_ids = data['Local Storage ID']
    cfg.logger.info('Restore from warehouse.')
    assert len(warehouse_ids) == len(product_ids) == len(descriptions) == len(counts)
    for i in range(len(warehouse_ids)):
        self.amazon_world.purchase_more_send(warehouse_ids[i], product_ids[i],
                                             descriptions[i], counts[i],
                                             seqnum.value)
        db_command1 = 'UPDATE miniamazon_app_product SET storage = ' + str(cfg.max_storage) + \
                      ' WHERE product_id = ' + str(product_ids[i])
        db_command2 = 'UPDATE miniamazon_app_localstorage SET storage = ' + str(cfg.max_storage) + \
                      ' WHERE local_storage_id = ' + str(local_storage_ids[i])
        db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                   cfg.db_host, cfg.db_port)
        db_interface.setup_excecute_and_close(db_command1)
        db_interface.setup_excecute_and_close(db_command2)
        seqnum.value += 1
def test_add_0_rmsd_conf_existing_mol(self):
    coord = """
 OpenBabel12071320143D

  5  4  0  0  0  0  0  0  0  0999 V2000
    1.0663    0.0688   -0.0295 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.1585    0.0688   -0.0295 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.7023    0.7365    0.7544 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.7023   -0.9440    0.1568 H   0  0  0  0  0  0  0  0  0  0  0  0
    0.7023    0.4138   -0.9997 H   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
  1  3  1  0  0  0  0
  1  4  1  0  0  0  0
  1  5  1  0  0  0  0
M  END
"""
    conf = Conformation(Mol=coord, Molecule_SMILES='C')
    can = mol_2_can(coord)
    dbi = DBInterface(self.ses)
    db_methane = Molecule(SMILES='C')
    dbi.add_molecule(db_methane)
    db_mol = dbi.get_molecule(can)
    db_mol.add_conformation(conf)
    self.ses.add(db_mol)
    db_conf = db_mol.get_conformation(coord, 0.0000001)
    assert db_conf == conf
def call_truck_recv(self, recv_wrapper, packages_data):
    # a truck arrived: record its id on the first pending order collection
    num_of_arrived_truck = len(recv_wrapper.get_data().truckArrived)
    for i in range(num_of_arrived_truck):
        for key, package in packages_data.items():
            truck_id = get_truckid_from_sql(key)
            if truck_id is None:
                truck_id = recv_wrapper.get_data().truckArrived[i].truck_id
            assert truck_id is not None
            db_command = 'UPDATE miniamazon_app_ordercollection SET truck_id = ' + str(truck_id) + \
                         ' WHERE order_collection_id = ' + str(key)
            db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                       cfg.db_host, cfg.db_port)
            db_interface.setup_excecute_and_close(db_command)
            break
    return recv_wrapper
def test_add_molecule(self):
    smiles = 'CCCCCCC'
    mol = Molecule(SMILES=smiles, IUPACName='Heptane')
    dbi = DBInterface(self.ses)
    dbi.add_molecule(mol)
    data_mol = dbi.get_molecule(smiles)
    assert str(data_mol.SMILES) == str(mol.SMILES)
def go_deliver_recv(self, recv_wrapper, packages_data):
    for data in recv_wrapper.get_data().delivered:
        key = data.packageid
        db_command = 'UPDATE miniamazon_app_ordercollection SET status = \'Delivered\'' + \
                     ' WHERE order_collection_id = ' + str(key)
        db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                   cfg.db_host, cfg.db_port)
        db_interface.setup_excecute_and_close(db_command)
    return recv_wrapper
def __init__(self, width: int = 7, height: int = 6) -> None:
    # set up relevant variables
    self.__width = width
    self.__height = height
    # establish a connection to the sqlite database via a "DBInterface" object
    self.__db_interface = DBInterface()
    # generate a new board with the given width and height by creating the corresponding entries in the database
    self.__db_interface.generate_board(width, height)
def init_warehouse(warehouse_data):
    for data in warehouse_data:
        db_command = 'INSERT INTO miniamazon_app_warehouse(warehouse_id, x, y) SELECT ' + \
                     str(data['warehouse_id']) + ', ' + str(data['x']) + ', ' + str(data['y']) + \
                     ' WHERE NOT EXISTS(select * from miniamazon_app_warehouse WHERE warehouse_id=' + \
                     str(data['warehouse_id']) + ')'
        db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                   cfg.db_host, cfg.db_port)
        db_interface.setup_excecute_and_close(db_command)
def test_write_new_mol_and_new_conf_and_conf_props(self):
    rism_toluene = RISM_3D_calculation('test/toluene')
    can = 'Cc1ccccc1'
    wr = Writer(self.ses)
    wr.write_rism(rism_toluene, {'Source': 'David'})
    dbi = DBInterface(self.ses)
    db_mol = dbi.get_molecule(can)
    db_conf = db_mol.find_0rmsd_conformation()
    assert str(db_conf.Source) == 'David'
    self.ses.rollback()
def test_write_rism_calculation(self):
    rism_toluene = RISM_3D_calculation('test/toluene')
    can = 'Cc1ccccc1'
    wr = Writer(self.ses)
    wr.write_rism(rism_toluene, {'Source': 'David'})
    dbi = DBInterface(self.ses)
    db_mol = dbi.get_molecule(can)
    db_conf = db_mol.find_0rmsd_conformation()
    db_calc = db_conf.get_rism_calculations()[0]
    assert db_calc.SolvE == 1.34305025E+001
    assert db_calc.Temperature == '298'
    self.ses.rollback()
def put_on_truck_recv(self, recv_wrapper, packages_data):
    # send ack back to world
    ack_wrapper = MessageWrapper(self.create_acks_msg_in_batch(recv_wrapper.get_data().loaded))
    cfg.logger.info('Send acks: {}'.format(ack_wrapper.parse_itself(wapb.ACommands)))
    ack_wrapper.encode_and_send(self.socket)
    for data in recv_wrapper.get_data().loaded:
        key = data.shipid
        db_command = 'UPDATE miniamazon_app_ordercollection SET status = \'Loaded\'' + \
                     ' WHERE order_collection_id = ' + str(key)
        db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                   cfg.db_host, cfg.db_port)
        db_interface.setup_excecute_and_close(db_command)
    return recv_wrapper
def test_write_new_mol_and_new_0_conf(self):
    rism_toluene = RISM_3D_calculation('test/toluene')
    can = 'Cc1ccccc1'
    wr = Writer(self.ses)
    wr.write_rism(rism_toluene)
    dbi = DBInterface(self.ses)
    db_mol = dbi.get_molecule(can)
    db_can = str(db_mol.SMILES)
    db_conf = db_mol.find_0rmsd_conformation()
    pymol = pybel.readstring('mol', str(db_conf.Mol))
    assert can == db_can
    # assert xyz == db_xyz
    self.ses.rollback()
def status_message(message):
    with DBInterface(config.database_name) as db:
        courses = db.get_chat_courses(message.chat.id)
    if not courses:
        bot.send_message(message.chat.id, **config.md_mes(config.no_courses_message))
        return
    bot.send_message(message.chat.id, **config.md_mes(format_status(courses)))
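# The handlers above open the database with "with DBInterface(...) as db:", so the
# project's DBInterface is evidently a context manager. Below is a minimal sketch of
# one plausible shape for such a wrapper, assuming sqlite3; the class name, table
# schema, and add_chat method here are illustrative, not the project's actual API.
import sqlite3


class SQLiteInterface:
    def __init__(self, database_name):
        self.database_name = database_name

    def __enter__(self):
        # open a fresh connection for each "with" block
        self.connection = sqlite3.connect(self.database_name)
        self.cursor = self.connection.cursor()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # commit on success, always close, never swallow exceptions
        if exc_type is None:
            self.connection.commit()
        self.connection.close()
        return False

    def add_chat(self, chat_id):
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS chats (chat_id INTEGER PRIMARY KEY)")
        self.cursor.execute(
            "INSERT OR IGNORE INTO chats (chat_id) VALUES (?)", (chat_id,))


# usage mirrors the handlers above
with SQLiteInterface("bot.db") as db:
    db.add_chat(12345)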
def check(self):
    with DBInterface(config.database_name) as db:
        courses = db.get_courses()
    for course_id, old_name, old_last_check, old_busy, old_places in courses:
        old_full = (old_busy >= old_places)
        new_id, new_name, new_busy, new_places = parser.get_course_info(str(course_id))
        t = time.time()
        with DBInterface(config.database_name) as db:
            db.add_course(new_id, new_name, new_busy, new_places, t)
        # look up the followers first, so they are available for both the
        # name-change notice and the availability notification
        with DBInterface(config.database_name) as db:
            course_followers = db.get_course_chats(new_id)
        if new_name != old_name:
            self.notice_change_name(course_followers)
        if new_busy >= new_places:
            continue
        self.notification(old_full, new_busy, new_places, course_followers)
def get_truckid_from_sql(orderid, verbose=True):
    db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                               cfg.db_host, cfg.db_port)
    db_command = 'SELECT truck_id FROM miniamazon_app_ordercollection' + \
                 ' WHERE order_collection_id = ' + str(orderid)
    db_interface.setup()
    db_interface.execute(db_command, verbose)
    truck_id = db_interface.cursor.fetchone()[0]
    db_interface.close()
    return truck_id
def __init__(self, parent, width, height):
    tk.Frame.__init__(self, parent)
    self.parent = parent
    self.window_width = width
    self.window_height = height
    self.frames = self.build_layout()
    self.interface = DBInterface(DB_NAME)
    self.entries_initialized = False
    self.entry_list = self.interface.get_all_entries()
    self.parameter_fields = {
        'location_null': tk.BooleanVar(),
        'state': tk.StringVar(),
        'city': tk.StringVar(),
        'tags': tk.StringVar(),
        'title': tk.StringVar(),
        'company': tk.StringVar(),
        'remote': tk.BooleanVar(),
        'results': tk.StringVar()
    }
    self.current_info_dict = {}
    self.sort_order = {
        'id': 'ASC',
        'title': 'ASC',
        'company': 'ASC',
        'state': 'ASC',
        'published': 'ASC'
    }
    # self.entryScrolledText = st.ScrolledText(self.frames['jobsFrame'])
    # self.entryScrolledText.config(width=self.frames['jobsFrame']['width'])
    # self.entryScrolledText.grid(row=0, column=0)
    self.entryScrolledText = tk.Text(
        self.frames['jobsFrame'],
        width=self.frames['jobsFrame']['width']
    )
    self.entryScrolledText.grid(row=0, column=0)
    self.entryScrollbar = tk.Scrollbar(self.frames['jobsFrame'])
    self.entryScrollbar.grid(row=0, column=1)
    self.add_frame_components()
def parse_rism_folders(base_folder):
    ses = create_session()
    dbi = DBInterface(ses)
    dirs = os.walk(base_folder)
    for p, _, filenames in dirs:
        dbfs = get_dbf(filenames)
        for dbf in dbfs:
            try:
                add_meta_f_to_db(dbf, p, dbi)
            except NoResultFound:
                print 'No results found for {}'.format(dbf)
    ses.commit()
    ses.close()
def init_database():
    # warehouse, products, orders, local_storages, carts,
    # miniamazon_app_localstorage_product, miniamazon_app_product_warehouse
    table_names = [
        'miniamazon_app_cart_product',
        'miniamazon_app_cart_warehouse',
        'miniamazon_app_cart',
        'miniamazon_app_localstorage_product',
        'miniamazon_app_localstorage',
        'miniamazon_app_product_order',
        'miniamazon_app_product_warehouse',
        'miniamazon_app_order_warehouse',
        'miniamazon_app_order',
        'miniamazon_app_ordercollection',
        'miniamazon_app_product',
        'miniamazon_app_warehouse',
    ]
    for table_name in table_names:
        db_command = 'DELETE FROM ' + table_name
        db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                                   cfg.db_host, cfg.db_port)
        db_interface.setup_excecute_and_close(db_command)
def follow_handler(message):
    try:
        if len(message.text.split()) >= 1:
            for mes in message.text.split()[1:]:
                course_info = parser.get_course_info(mes)
                t = time.time()
                with DBInterface(config.database_name) as db:
                    db.add_course(*course_info, add_time=t)
                    db.follow(message.chat.id, course_info[0], t)
                bot.send_message(message.chat.id, **config.md_mes(
                    config.successful_follow_message % course_info[1:]))
        else:
            bot.send_message(message.chat.id, **config.md_mes(config.no_link_error))
    except DatabaseError as e:
        bot.send_message(message.chat.id, **config.md_mes(config.database_error_message))
        print(str(e))
def unfollow_handler(message):
    try:
        if len(message.text.split()) >= 1:
            for mes in message.text.split()[1:]:
                course_info = parser.get_course_info(mes)
                with DBInterface(config.database_name) as db:
                    res = db.unfollow(course_info[0], message.chat.id)
                if res:
                    bot.send_message(message.chat.id, **config.md_mes(
                        config.successful_unfollow_message % course_info[1]))
                else:
                    bot.send_message(message.chat.id, **config.md_mes(
                        config.not_following_message % course_info[1]))
        else:
            bot.send_message(message.chat.id, **config.md_mes(config.no_link_error))
    except DatabaseError as e:
        bot.send_message(message.chat.id, **config.md_mes(config.database_error_message))
        print(str(e))
def test_update_existing_conf(self):
    rism_toluene = RISM_3D_calculation('test/toluene')
    rism_toluene_copy = RISM_3D_calculation('test/toluene_copy')
    can = 'Cc1ccccc1'
    wr = Writer(self.ses)
    wr.write_rism(rism_toluene)
    dbi = DBInterface(self.ses)
    db_mol = dbi.get_molecule(can)
    db_conf = db_mol.find_0rmsd_conformation()
    assert str(db_conf.Source) == 'None'
    wr.write_rism(rism_toluene_copy, {'Source': 'David'})
    dbi = DBInterface(self.ses)
    db_mol = dbi.get_molecule(can)
    db_conf = db_mol.find_0rmsd_conformation()
    assert str(db_conf.Source) == 'David'
    self.ses.rollback()
class Board:
    def __init__(self, width: int = 7, height: int = 6) -> None:
        # set up relevant variables
        self.__width = width
        self.__height = height
        # establish a connection to the sqlite database via a "DBInterface" object
        self.__db_interface = DBInterface()
        # generate a new board with the given width and height by creating the corresponding entries in the database
        self.__db_interface.generate_board(width, height)

    def do_move(self, column: int, player_id: int) -> None:
        # find the field on which the token will "fall", starting from the bottom of the given column;
        # it is guaranteed at this point that the move is valid
        y = 0
        while self.__db_interface.get_field(column, y) != 0:
            y += 1
        # let "db_interface" update the database
        self.__db_interface.set_field(column, y, player_id)
        # save this most recent move as an (x, y) position in the database - this makes it easier to check for a win
        self.__db_interface.add_to_history(column, y)

    # when a bot makes a move to figure out whether it is a good one, the move needs to be
    # resettable, because the bot only "thinks"
    def undo_move(self) -> None:
        # find out which move needs to be reset
        undo_x, undo_y = self.__db_interface.get_last_move()
        # actually reset it
        self.__db_interface.set_field(undo_x, undo_y, 0)
        # delete the latest move from the history
        self.__db_interface.clear_last_move()

    def get_board(self) -> dict:
        # simply return the dictionary requested from "db_interface"
        return self.__db_interface.get_board(self.__width)

    def clear_board(self) -> None:
        # instruct "db_interface" to reset the entries in the database
        self.__db_interface.clear_board()

    def get_last_move(self) -> tuple:
        # instruct "db_interface" to return the last move
        return self.__db_interface.get_last_move()

    def get_history(self) -> list:
        # instruct "db_interface" to get the entire move history as a list of tuples
        return self.__db_interface.get_history()

    def clear_history(self) -> None:
        # let "db_interface" delete the game history
        self.__db_interface.clear_history()

    # the "@property" decorator makes these variables accessible from outside the class even though they are private;
    # if width or height need to be changed by "game", a new Board object must be created
    @property
    def width(self) -> int:
        return self.__width

    @property
    def height(self) -> int:
        return self.__height
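# A minimal, self-contained sketch of the "lowest empty field" search that Board.do_move
# relies on, using a hypothetical in-memory stand-in for DBInterface. The real DBInterface
# persists fields in sqlite; FakeDBInterface below only illustrates the assumed contract
# (get_field returns 0 for an empty cell, set_field/add_to_history record a move).
class FakeDBInterface:
    def __init__(self):
        self.fields = {}   # (x, y) -> player_id, 0 means empty
        self.history = []  # list of (x, y) moves, newest last

    def generate_board(self, width, height):
        self.fields = {(x, y): 0 for x in range(width) for y in range(height)}

    def get_field(self, x, y):
        return self.fields[(x, y)]

    def set_field(self, x, y, player_id):
        self.fields[(x, y)] = player_id

    def add_to_history(self, x, y):
        self.history.append((x, y))

    def get_last_move(self):
        return self.history[-1]

    def clear_last_move(self):
        self.history.pop()


db = FakeDBInterface()
db.generate_board(7, 6)

# drop two tokens into column 3, exactly as Board.do_move does it
for player_id in (1, 2):
    y = 0
    while db.get_field(3, y) != 0:  # walk up until the first empty field
        y += 1
    db.set_field(3, y, player_id)
    db.add_to_history(3, y)

print(db.get_last_move())  # (3, 1) - the second token landed on top of the first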
def __init__(self, session):
    """Takes a session instance as an argument"""
    self.session = session
    self.dbi = DBInterface(self.session)
def first_purchase(recv_wrapper, seqnum):
    # construct the db commands
    whnum = recv_wrapper.get_data().arrived[0].whnum
    product_id = recv_wrapper.get_data().arrived[0].things[0].id
    product_desc = recv_wrapper.get_data().arrived[0].things[0].description
    product_count = recv_wrapper.get_data().arrived[0].things[0].count
    db_command1 = 'INSERT INTO miniamazon_app_product(product_id, name, description, price, ' + \
                  'storage) VALUES(' + str(product_id) + ', \'' + product_desc + \
                  '\', \'' + product_desc + '\', ' + str(50) + ', ' + str(product_count) + ')' + \
                  ' ON CONFLICT DO NOTHING'
    db_command2 = 'INSERT INTO miniamazon_app_product_warehouse(product_id, warehouse_id) VALUES(' + \
                  str(product_id) + ', ' + str(whnum) + ')' + ' ON CONFLICT DO NOTHING'
    db_command3 = 'INSERT INTO miniamazon_app_localstorage(storage, warehouse_id) VALUES(' + str(product_count) + \
                  ', ' + str(whnum) + ') ON CONFLICT DO NOTHING RETURNING local_storage_id'
    # update the database
    db_interface = DBInterface(cfg.db_name, cfg.db_user, cfg.db_password,
                               cfg.db_host, cfg.db_port)
    db_interface.setup_excecute_and_close(db_command1)
    db_interface.setup_excecute_and_close(db_command2)
    db_interface.setup()
    db_interface.execute(db_command3)
    local_storage_id = db_interface.cursor.fetchone()[0]
    db_interface.close()
    db_command4 = 'INSERT INTO miniamazon_app_localstorage_product(localstorage_id, product_id) ' + \
                  'VALUES(' + str(local_storage_id) + ', ' + str(product_id) + ') ON CONFLICT DO NOTHING'
    db_interface.setup_excecute_and_close(db_command4)
def main():
    if len(sys.argv) != 2:
        sys.exit("Wrong number of arguments!\n\tExiting")
    global beaconTable
    global beaconTableLocker
    global database
    global configFileContent
    global configFCLocker
    global configFileName
    global pubTopic
    print("Initializing server")
    configFileName = sys.argv[1]
    configFileContent = json.load(open(configFileName))
    beaconTable = dict()
    beaconTableLocker = Lock()
    configFCLocker = Lock()
    tmpIds = []
    for p in configFileContent["positions"]:
        tmpIds.append(p)
    for b in configFileContent["devices"].values():
        beaconTable[b] = BeaconInfo(b, tmpIds)
    # Instantiate broker
    broker_address = configFileContent["broker-ip"]
    subTopic = configFileContent["subscribe_topic"]
    pubTopic = configFileContent["publish_topic"]
    print("Init broker")
    client = mqtt.Client("P1")
    client.connect(broker_address)
    print("Subscription to " + broker_address + " on topic " + subTopic)
    client.subscribe(subTopic)
    client.on_message = on_message
    client.loop_start()
    # Initiate connection with MongoDB
    database = DBInterface(configFileContent["DB_connection_params"])
    # Activate triangulator thread
    triangulate = Triangulate(int(configFileContent["algorithm-interval"]),
                              beaconTable, beaconTableLocker, database)
    triangulate.daemon = True
    triangulate.start()
    # Activate web-server thread
    webServer = WebServer(webApp, configFileContent["server-ip"],
                          int(configFileContent["server-port"]))
    webServer.daemon = True
    webServer.start()
    print("Starting loop")
    while True:
        time.sleep(1)
class DBInterfaceTest(unittest.TestCase):
    """docstring for DBInterfaceTest"""

    def setUp(self):
        call("py create_test_database.py", shell=True)
        self.dbi = DBInterface("cinema_test.db")
        self.movies = movies_data()
        self.projections = projections_data()
        self.reservations = reservations_data()

    def test_get_movies(self):
        self.movies = sorted(self.movies, key=lambda k: k['rating'], reverse=True)
        self.assertEqual(self.movies, self.dbi.get_movies_by_rating())

    def test_get_movie_projections(self):
        expected = [{
            "id": 1,
            "movie_id": 1,
            "type": "3D",
            "date": "2014-04-01",
            "time": "19:10"
        }, {
            "id": 3,
            "movie_id": 1,
            "type": "2D",
            "date": "2014-04-01",
            "time": "19:00"
        }, {
            "id": 2,
            "movie_id": 1,
            "type": "4DX",
            "date": "2014-04-02",
            "time": "21:00"
        }]
        self.assertEqual(expected, self.dbi.get_movie_projections(1))

    def test_get_movie_projections_for_date(self):
        expected = [{
            "id": 1,
            "movie_id": 1,
            "type": "3D",
            "date": "2014-04-01",
            "time": "19:10"
        }, {
            "id": 3,
            "movie_id": 1,
            "type": "2D",
            "date": "2014-04-01",
            "time": "19:00"
        }]
        self.assertEqual(expected, self.dbi.get_movie_projections(1, "2014-04-01"))

    def test_get_projection_seats(self):
        matrix = self.dbi.get_matrix()
        matrix[1][0] = 1
        matrix[2][4] = 1
        matrix[6][7] = 1
        self.assertEqual(matrix, self.dbi.get_projection_seats(1))

    def test_get_projection_remaining_seats(self):
        self.assertEqual(97, self.dbi.get_projection_remaining_seats(1))

    def test_get_reservation_by_name(self):
        expected = [{
            "id": 1,
            "username": "******",
            "projection_id": 1,
            "row": 2,
            "col": 1
        }, {
            "id": 2,
            "username": "******",
            "projection_id": 1,
            "row": 3,
            "col": 5
        }, {
            "id": 3,
            "username": "******",
            "projection_id": 1,
            "row": 7,
            "col": 8
        }]
        self.assertEqual(expected, self.dbi.get_reservation_by_name("RadoRado"))

    def test_delete_reservation_by_name(self):
        self.assertTrue(self.dbi.delete_reservation_by_name("RadoRado"))
        self.assertEqual([], self.dbi.get_reservation_by_name("RadoRado"))

    def tearDown(self):
        try:
            os.remove("cinema_test.db")
        except OSError:
            pass
def setUp(self):
    call("py create_test_database.py", shell=True)
    self.dbi = DBInterface("cinema_test.db")
    self.movies = movies_data()
    self.projections = projections_data()
    self.reservations = reservations_data()
from fileoperations import CSVOperations
from crawler.article_crawler import ArticelCrawler
from scrapy.crawler import CrawlerProcess
from db_interface import DBInterface
from crawler.url import URL

# insert crawled articles into the db
dbInterface = DBInterface()
if not dbInterface.article_table_already_exists():
    print('Created Table articles')
    dbInterface.create_articles_table()
else:
    print('Recreated Table articles')
    dbInterface.delete_articles_table()
    dbInterface.create_articles_table()

urls = dbInterface.get_urls()
ArticelCrawler.urls.extend(urls)

# start the crawler
process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
process.crawl(ArticelCrawler)
process.start()  # the script will block here until the crawling is finished

print('failed URLs are:')
print(ArticelCrawler.get_failed_urls())
def start_handler(message):
    answer = '\n'.join([config.hello_message, config.help_message])
    with DBInterface(config.database_name) as db:
        db.add_chat(message.chat.id)
    bot.send_message(message.chat.id, **config.md_mes(answer))
def test_check_molecule_non_existing(self):
    SMILES = 'CCCCC'
    dbi = DBInterface(self.ses)
    with pytest.raises(NoResultFound) as e:
        dbi.get_molecule(SMILES)
class ArticelCrawler(scrapy.Spider):
    name = "article"
    allowed_domains = ["zeit.de"]
    handle_httpstatus_list = [404]
    urls = []
    articles = []
    failed_urls = []
    db_interface = DBInterface()

    def __init__(self):
        super().__init__()

    def start_requests(self):
        print('crawling....')
        count = 0
        for url in self.urls:
            yield scrapy.Request(
                url=url[0] + COMPLETE_ARTICLE_URL_SUBDOMAIN,
                headers={'referer': 'https://www.facebook.com/zeitonline/'},
                callback=self.parse,
                method='GET',
                meta={
                    ID_IDENTIFIER: count,
                    URL_IDENTIFIER: url[0]
                },
            )
            count += 1

    def parse(self, response):
        if response.status != HTTP_RESPONSE_OK:
            self.failed_urls.append(
                [response.meta[ID_IDENTIFIER], response.status, response.url])
        else:
            article = self._create_article_from_response(response)
            if article.get_body() != "":
                self.db_interface.insert_article(article)

    @staticmethod
    def get_failed_urls():
        return ArticelCrawler.failed_urls

    @staticmethod
    def get_articles():
        return ArticelCrawler.articles

    # creates an article based on the crawler response
    def _create_article_from_response(self, response):
        article = Article()
        article.set_id(response.meta[ID_IDENTIFIER])
        article.set_url(response.meta[URL_IDENTIFIER])
        sel = Selector(response)
        heading = ""
        text_areas = sel.css(
            Article.XPATH_ARTICLE_HEADING).xpath('*//div//text()').extract()
        for t in text_areas:
            heading += t
        heading = self._filter_unnecessary_linebreaks(heading)
        article.set_heading(self._filter_text_from_markup(heading))
        ressort = response.xpath(Article.XPATH_RESSORT).extract_first()
        if ressort is not None:
            article.set_ressort(self._filter_text_from_markup(ressort).lower())
        else:
            self._parse_html_head_and_set_ressort(response, article)
        paragraphs = sel.css('div[itemprop="articleBody"]').xpath(
            '*//p//text()').extract()
        body = ""
        for p in paragraphs:
            body += p
        body = self._filter_unnecessary_linebreaks(body)
        article.set_body(body)
        return article

    # removes markup tags from the given text
    def _filter_text_from_markup(self, markup):
        return remove_tags(remove_tags_with_content(markup, ('script', )))

    def _filter_unnecessary_linebreaks(self, text):
        text = text.rstrip()
        return text.replace('\n', '').replace('\r', '')

    # parses the html head in order to find ressorts in the scripts for the given article
    def _parse_html_head_and_set_ressort(self, response, article):
        header = response.xpath(Article.XPATH_ARTICLE_HEAD)[0].extract()
        # extracts all occurrences of 'ressort': "..." or 'sub_ressort': "..." in the
        # html head in order to get the ressort
        ressort = self._find_ressort_by_regex('\'ressort\': "(.+)"', header)
        if ressort is None:
            ressort = self._find_ressort_by_regex('\'sub_ressort\': "(.+)"', header)
        # set the specific ressort
        article.set_ressort(ressort)

    def _find_ressort_by_regex(self, regex, text):
        ressortMatch = re.search(regex, text)
        ressort = None
        if ressortMatch is not None:
            # the string 'ressort': "politik" is trimmed to politik
            ressort = re.search('"(.+)"',
                                ressortMatch.group(0)).group(0).replace('"', '')
        return ressort
print( "Usage: " + sys.argv[0] + " option [params]" ) print( "" ) print( "Options:" ) print( " portscan ipfrom ipto [port list]" ) print( " virtualhosts ipfrom ipto" ) print( " httpcrawl [robots=True] [re-scan]" ) sys.exit(1) option = sys.argv[1] if option == "portscan": ipfrom = sys.argv[2] ipto = sys.argv[3] ports = [ int(x) for x in sys.argv[4:] ] if len(ports) == 0: ports = [21,22,25,80,443,8080] db = DBInterface() dnspool = Pool_Scheduler(1,DNS_Solver) portscanpool = Pool_Scheduler(3,Port_Scanner,dnspool,db) iplist = list(ip_crawler.iterateIPRange(ipfrom,ipto)) compoud_list = [ (x, ports) for x in iplist ] # Perfom port scan, limit outstanding jobs (Linux usually limits # of open files to 1K) We need 100k soft and 200k hard of limit :) max_jobs = 60000 batch_size = 1000 print( "Starting ..." ) try: while len(compoud_list) != 0: nj = portscanpool.numActiveJobs() t = time.time() while nj < max_jobs-batch_size and len(compoud_list) != 0 and time.time()-t < 10: