def _main(arguments=sys.argv):
    """CLI entry point: parse options, then run the conversion.

    :param arguments: argument list to parse; defaults to ``sys.argv`` (the
        program name is stripped below in that case).
    :return: result of ``make_exit`` (exit message/code wrapper).
    """
    # if version is specified ignore any other arguments
    if '--version' in arguments or '-v' in arguments:
        return make_exit(message="hepdata-converter version: %s" % version.__version__)

    parser = argparse.ArgumentParser(description="CLI tools for converting between HEP data formats", add_help=True,
                                     formatter_class=argparse.RawTextHelpFormatter,
                                     epilog=generate_help_epilogue())
    parser.add_argument("--input-format", '-i', action='store', default='yaml', help='format of the input file/s (default: yaml) [choose one option from Parsers section below]')
    parser.add_argument("--output-format", '-o', action='store', default='yaml', help='format of the output file/s (default: yaml) [choose one option from Writers section below]')
    parser.add_argument("--version", '-v', action='store_const', const=True, default=False, help='Show hepdata-converter version')
    parser.add_argument("--hepdata-doi", '-d', action='store', default='', help='Pass HEPData DOI, e.g. "10.17182/hepdata.74247.v1"')
    parser.add_argument("input")
    parser.add_argument("output")

    # Strip the program name when the default sys.argv was used.
    # NOTE(review): `==` (not `is`) means a caller-supplied list equal to
    # sys.argv is also stripped — presumably intentional; confirm.
    if arguments == sys.argv:
        arguments = sys.argv[1:]

    # First pass: known options only, so we can learn which formats were chosen.
    program_args = vars(parser.parse_known_args(arguments)[0])

    input_format = program_args['input_format']
    output_format = program_args['output_format']

    # Let the selected parser/writer classes add their own CLI options.
    Parser.get_concrete_class(input_format).register_cli_options(parser)
    Writer.get_concrete_class(output_format).register_cli_options(parser)

    # reparse arguments, now with added options from concrete parsers / writers
    program_args = vars(parser.parse_args(arguments))
    try:
        convert(program_args['input'], program_args['output'], program_args)
        return make_exit()
    except ValueError as e:
        return make_exit(message="Options error: %s" % str(e), code=1)
    def test_parse_index_row_returns_dict(self):
        """parse_index_row must return a plain dict for a sample row element."""
        p = Parser()
        elem = make_tree(SAMPLE_ROW)

        result = p.parse_index_row(elem)

        self.assertEqual(type(result), dict)
Exemple #3
0
def _main(arguments=sys.argv):
    """CLI entry point: parse options, then run the conversion.

    :param arguments: argument list to parse; defaults to ``sys.argv`` (the
        program name is stripped below in that case).
    :return: result of ``make_exit`` (exit message/code wrapper).
    """
    # if version is specified ignore any other arguments
    if '--version' in arguments or '-v' in arguments:
        return make_exit(message="hepdata-converter version: %s" %
                         version.__version__)

    parser = argparse.ArgumentParser(
        description="CLI tools for converting between HEP data formats",
        add_help=True,
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=generate_help_epilogue())
    parser.add_argument(
        "--input-format",
        '-i',
        action='store',
        default='yaml',
        help=
        'format of the input file/s (default: yaml) [choose one option from Parsers section below]'
    )
    parser.add_argument(
        "--output-format",
        '-o',
        action='store',
        default='yaml',
        help=
        'format of the output file/s (default: yaml) [choose one option from Writers section below]'
    )
    parser.add_argument("--version",
                        '-v',
                        action='store_const',
                        const=True,
                        default=False,
                        help='Show hepdata-converter version')
    parser.add_argument(
        "--hepdata-doi",
        '-d',
        action='store',
        default='',
        help='Pass HEPData DOI, e.g. "10.17182/hepdata.74247.v1"')
    parser.add_argument("input")
    parser.add_argument("output")

    # Strip the program name when the default sys.argv was used.
    if arguments == sys.argv:
        arguments = sys.argv[1:]

    # First pass: known options only, so we can learn which formats were chosen.
    program_args = vars(parser.parse_known_args(arguments)[0])

    input_format = program_args['input_format']
    output_format = program_args['output_format']

    # Let the selected parser/writer classes add their own CLI options.
    Parser.get_concrete_class(input_format).register_cli_options(parser)
    Writer.get_concrete_class(output_format).register_cli_options(parser)

    # reparse arguments, now with added options from concrete parsers / writers
    program_args = vars(parser.parse_args(arguments))
    try:
        convert(program_args['input'], program_args['output'], program_args)
        return make_exit()
    except ValueError as e:
        return make_exit(message="Options error: %s" % str(e), code=1)
Exemple #4
0
    def __init__(self):
        """Set up the QA bot: redis and Neo4j connections, then compile the
        question-classification regexes from the config file."""
        self.r_helper = redis_helper()
        # NOTE(review): credentials look redacted ("******") — must be
        # configured before this runs against a real Neo4j instance.
        self.g = Graph(host="127.0.0.1",
                       http_port=7474,
                       user="******",
                       password="******")
        # Cap on how many results a query returns.
        self.num_limit = 20

        # Maps relation names to the entity label they resolve to.
        self.rela2Entity = {
            "rel_clauses": "Clauses",
            "rel_scope": "Scope",
            "rel_type": "Insur_type"
        }

        self.parse = Parser()

        # All regex patterns are read from kg_bot.conf, grouped by question kind.
        self.conf = configparser.ConfigParser()
        self.conf.read(cur_dir + '/../config/kg_bot.conf')
        # Property-style questions (form, term, price, ...).
        self.form_reg = re.compile(self.conf.get("askProperty", "form_reg"))
        self.term_reg = re.compile(self.conf.get("askProperty", "term_reg"))
        self.price_reg = re.compile(self.conf.get("askProperty", "price_reg"))
        self.sale_reg = re.compile(self.conf.get("askProperty", "sale_reg"))
        self.crowd_reg = re.compile(self.conf.get("askProperty", "crowd_reg"))
        self.url_reg = re.compile(self.conf.get("askProperty", "url_reg"))

        self.amount_reg = re.compile(self.conf.get("askProperty",
                                                   "amount_reg"))
        self.info_reg = re.compile(self.conf.get("askProperty", "info_reg"))

        # Entity-style questions (clause, scope, type, generic entity).
        self.clause_reg = re.compile(self.conf.get("askEntity", "clause_reg"))
        self.scope_reg = re.compile(self.conf.get("askEntity", "scope_reg"))
        self.type_reg = re.compile(self.conf.get("askEntity", "type_reg"))
        self.entity_reg = re.compile(self.conf.get("askEntity", "entity_reg"))

        # Relation-style questions.
        self.rela_reg = re.compile(self.conf.get("askRela", "rela_reg"))
Exemple #5
0
def parser():
    """Poll all configured sources forever, storing and broadcasting new tasks.

    Fixes an ``IndentationError`` in the original: the first two statements of
    the ``while`` body were indented deeper than the rest of the loop, so the
    module would not even import.
    """
    global a_date
    parsed_tasks_links = []

    Parser.get_gdoc_config('1VGObmBB7RvgBtBUGW7lXVPvm6_m96BJpjFIH_qkZGBM')

    while True:
        # Refresh the date and re-point the log file each iteration, in case
        # the day rolled over while sleeping.
        a_date = datetime.date.today()
        set_file_logger()

        parsed_tasks = []
        for batch in Parser.parse_all():
            parsed_tasks.extend(batch)
        # A task is "new" if it was absent from the previous pass and is not
        # already stored in the database.
        new_tasks = [task for task in parsed_tasks if
                     task['link'] not in parsed_tasks_links and not db_handler.check_task_link(task['link'])]
        logger.debug(f"New tasks {[task['link'] for task in new_tasks]}")

        for task in new_tasks:
            task = format_task(task)
            print(f"{', '.join([task['title'], task['price'], task['currency'], task['price_format']])}")
            logger.debug(f"Sending task {task['link']} to db")
            db_handler.add_task(task)
        tasks_sender(new_tasks)
        # Remember everything seen this pass for the next iteration's dedup.
        parsed_tasks_links = [task['link'] for task in parsed_tasks]
        time.sleep(5)
    def test_parse_index_table_returns_all_rows(self):
        """A table containing ten row elements must parse into ten rows."""
        row = '<tr class="tCenter hl-tr"></tr>'
        markup = '<table id="tor-tbl">' + ''.join([row] * 10) + '</table>'
        index_parser = Parser()

        parsed_rows = index_parser.parse_index_table(markup)

        self.assertEqual(len(parsed_rows), 10)
Exemple #7
0
def parse():
    """Read previously gathered raw data, parse it, and persist it as CSV."""
    logger.info("parse")

    persistor = Persistor()
    object_parser = Parser()

    parsed_files = object_parser.parse_object(persistor.read_raw_data())
    persistor.save_csv(parsed_files)
    def test_pid(self):
        """Every trace entry must echo its own pid/timestamp keys and carry the
        core fields.

        NOTE(review): Python 2 only (``dict.iteritems``).
        """
        p = Parser()
        res = p.parse(self.f, None, False)

        for pid, trace in res.iteritems():
            for timestamp, data in trace.iteritems():
                self.assertTrue(pid == data['pid'])
                self.assertTrue(timestamp == data['timestamp'])

                for item in ('path', 'timestamp', 'type'):
                    self.assertTrue(item in data)
    def test_main(self):
        """Parsing the fixture yields exactly 6 dict entries, each with string
        ``path`` and ``version`` fields.

        NOTE(review): Python 2 only (``basestring``).
        """
        p = Parser()

        res = p.parse(self.f, None, False)

        self.assertTrue(len(res) == 6)

        for e in res:
            self.assertTrue(isinstance(e, dict))
            self.assertTrue(isinstance(e['path'], basestring))
            self.assertTrue(isinstance(e['version'], basestring))
Exemple #10
0
 def run(self):
     """Interactive loop: parse infix expressions until blank input.

     NOTE(review): Python 2 only (``raw_input``, print statement, old
     ``except Exception, e`` syntax).
     """
     parser = Parser()
     while True:
         sourceStr = raw_input("Enter an infix expression: ")
         if sourceStr == "": break
         try:
             parser.parse(sourceStr)
             print parser.parseStatus()
         except Exception, e:
             print "Error:"
             print e
Exemple #11
0
 def run(self):
     """Interactive loop: parse arithmetic expressions, printing the parse
     status and tree for each; an empty line exits."""
     parser = Parser()
     while True:
         source = input(
             "Enter an arithmetic expression or just enter to quit: ")
         if not source:
             break
         try:
             parser.parse(source)
             print(parser.parseStatus())
             print(parser.tree)
         except Exception as err:
             print("Error:")
             print(err)
def identifyCorpus(corpus, x=-1):
    """ update corpus with mwedictionaries (type, count, tokens), 
        train, predict and evaluate corpus

    :param corpus: corpus object to predict over and evaluate.
    :param x: opaque parameter forwarded to EmbeddingOracle.train — TODO
        confirm its meaning against that module.
    :return: evaluation scores from Evaluation.evaluate.
    """
    # NOTE(review): debug print left in — consider logging instead.
    print(XPParams.use_extern_labels)
    if XPParams.use_extern_labels:
        # External labels supplied: skip training entirely.
        Parser.parse(corpus, "")  # -> prediction
        scores = Evaluation.evaluate(corpus)  # -> evaluate
    else:
        corpus.update()
        clf = EmbeddingOracle.train(corpus, x)  # -> training
        Parser.parse(corpus, clf)  # -> prediction
        scores = Evaluation.evaluate(corpus)  # -> evaluate
    return scores
Exemple #13
0
 def run(self):
     """Read infix expressions interactively; for each, print the tree's
     prefix/infix/postfix renderings and its value. Empty input exits."""
     parser = Parser()
     while True:
         expression = input("Enter an infix expression: ")
         if not expression:
             break
         try:
             tree = parser.parse(expression)
             print("Prefix:", tree.prefix())
             print("Infix:", tree.infix())
             print("Postfix:", tree.postfix())
             print("Value:", tree.value())
         except Exception as err:
             print("Error:")
             print(err)
    def test_get_parser(self):
        """Without an extension get_parser returns None; with each configured
        extension it returns an instance of the configured class.

        NOTE(review): Python 2 only (``dict.iteritems``).
        """
        p = Parser()

        fun = p.get_parser()

        self.assertTrue(fun == None)

        for ext, klass in PARSERS_CONFIG.iteritems():
            p = Parser()
            p.ext = ext
            fun = p.get_parser()
            self.assertTrue(fun != None)
            fun_class = load_by_name(klass)
            self.assertTrue(isinstance(fun, fun_class))
Exemple #15
0
 def run(self):
     """Interactive loop: parse infix expressions and print prefix/infix/
     postfix forms plus the evaluated value. Blank input exits."""
     parser = Parser()
     while True:
         sourceStr = input("Enter an infix expression: ")
         if sourceStr == "": break
         try:
             tree = parser.parse(sourceStr)
             print("Prefix:", tree.prefix())
             print("Infix:", tree.infix())
             print("Postfix:", tree.postfix())
             print("Value:", tree.value())
         except Exception as e:
             print("Error:")
             print(e)
Exemple #16
0
def main():
    """Preprocess DATASET's source files and bug reports, pickling each result."""
    dataset_parser = Parser(DATASET)

    # Source-code side.
    src_prep = SrcPreprocessing(dataset_parser.src_parser())
    src_prep.preprocess()
    with open(DATASET.root / 'preprocessed_src.pickle', 'wb') as out:
        pickle.dump(src_prep.src_files, out, protocol=pickle.HIGHEST_PROTOCOL)

    # Bug-report side.
    report_prep = ReportPreprocessing(dataset_parser.report_parser())
    report_prep.preprocess()
    with open(DATASET.root / 'preprocessed_reports.pickle', 'wb') as out:
        pickle.dump(report_prep.bug_reports, out,
                    protocol=pickle.HIGHEST_PROTOCOL)
    def test_main(self):
        """Parsing the fixture yields 9 groups; each symbol is a dict with a
        string name and a string-or-int offset.

        NOTE(review): Python 2 only (``iteritems``, ``basestring``).
        """
        p = Parser()
        res = p.parse(self.f, None, False)

        self.assertTrue(len(res) == 9)

        for grp, data in res.iteritems():
            self.assertTrue(isinstance(data, list))

            for symbol in data:
                self.assertTrue(isinstance(symbol, dict))
                self.assertTrue(isinstance(symbol['name'], basestring))
                self.assertTrue(
                    isinstance(symbol['offset'], basestring) or \
                    isinstance(symbol['offset'], int))
Exemple #18
0
class Interpreter(object):
    """Turns raw stimulus text into classified words stored in working memory."""

    def __init__(self, w_m, profile):
        self.parser = Parser()
        self.profile = profile
        self.w_m = w_m
        self.k_b = KnowledgeBase()

    def interpret(self, stimulus):
        """Parse *stimulus*, look each word up in the knowledge base, and file
        the first match per word into the matching working-memory percept list."""
        self.w_m.push_input()

        tokens = self.parser.parse(stimulus)

        recognized = []
        for token in tokens:
            hits = self.k_b.match(token)
            if hits:
                recognized.append(hits[0])

        # NOTE: 'adjectivs' mirrors the percept attribute's spelling elsewhere
        # in the project.
        buckets = {
            'noun': self.w_m.percept.nouns,
            'verb': self.w_m.percept.verbs,
            'adjective': self.w_m.percept.adjectivs,
            'adverb': self.w_m.percept.adverbs,
        }
        for entry in recognized:
            target = buckets.get(entry.get_word_class())
            if target is not None:
                target.append(entry)
Exemple #19
0
	def __init__(self, url, raw_html, step, lang="en"):
		"""Initialize an article container for *url* from its raw HTML.

		All extraction fields start empty and are presumably filled by later
		parsing steps — TODO confirm against the rest of the class.
		"""
		self.status = True
		self.url = url
		self.step = step
		self.lang = lang
		
		# title of the article
		self.title = None	
		#text
		self.article = u""
		self.cleaned_text = u""
		# meta
		self.meta_description = u""
		self.meta_lang = u""
		self.meta_favicon = u""
		self.meta_keywords = u""
		#link and domain
		self.canonical_link = u""
		self.domain = u""
		# cleaned text
		self.top_node = None
		self.tags = set()
		self.final_url = url
		self.raw_html = raw_html
		# the lxml Document object
		self.parser = Parser()
		self.raw_doc = u""
		self.publish_date = None
		self.additional_data = {}
		self.links = []
		self.outlinks = []
		self.inlinks = []
		self.start_date = datetime.datetime.today()
Exemple #20
0
def convert(input, output=None, options=None):
    """Converts arbitrary supported data format to the new HEPData YAML format, and writes the output files
    to the output_dir directory

    :param input: input, depending on the chosen input datatype it may be filepath, filetype object, directory, etc
    :param output: output directory to which converted files will be written; if omitted, the
        converted data is returned as a string (only valid for single-file writers)
    :param options: additional options used for conversion, including ``input_format`` and
        ``output_format`` (each a string naming a parser/writer class, default ``'yaml'``)
    :type options: dict
    :raise ValueError: raised if neither input_format nor output_format is specified, or if the
        chosen writer requires ``output`` and none was given
    """
    # Fix: mutable default argument ({}). Use a None sentinel instead; behavior
    # is unchanged since the dict was only ever read, never mutated.
    if options is None:
        options = {}

    if 'input_format' not in options and 'output_format' not in options:
        raise ValueError("no input_format and output_format specified!")

    input_format = options.get('input_format', 'yaml')
    output_format = options.get('output_format', 'yaml')

    # Instantiate the concrete parser/writer selected by name.
    parser = Parser.get_concrete_class(input_format)(**options)
    writer = Writer.get_concrete_class(output_format)(**options)

    if not output and not writer.single_file_output:
        raise ValueError("this output_format requires specifying 'output' argument")

    # if no output was specified create proxy output to which writer can insert data
    _output = output
    if not _output:
        _output = StringIO.StringIO()

    writer.write(parser.parse(input), _output)

    # if no output was specified return output
    if not output:
        return _output.getvalue()
Exemple #21
0
 def parse(self, path):
     """Load every row from *path* into self.rows via the matching parser."""
     source = Parser.get(path)
     self.search_method = source.search_method  # todo: fix
     current = source.next()
     while current:
         self.rows.append(current)
         current = source.next()
Exemple #22
0
 def __init__(self, log, conf, name):
     """Validate the match configuration and register its sources.

     Requires 'sources' and an integer 'workers' >= 1 in *conf*; invalid
     worker counts are coerced to 1 with an error logged.
     """
     Parser.__init__(self, log)
     self.name = name
     self.conf = conf
     self.sources = {}
     self.matches = 0
     if 'sources' not in self.conf: raise Exception('Invalid configuration for '+self.name+' match')
     if 'workers' not in self.conf: raise Exception('Field workers not specified in '+self.name+' match')
     if type(self.conf['workers']) != int:
         self.log.error('Invalid value for field workers in '+self.name+' match, setting to 1')
         self.conf['workers'] = 1
     if self.conf['workers'] < 1:
         self.log.error('Invalid value for field workers in '+self.name+' match, setting to 1')
         self.conf['workers'] = 1
     # Register each configured source by name.
     for s in self.conf['sources'].keys():
         self.add_source(s, self.conf['sources'][s])
Exemple #23
0
class Crawler(object):
    """Scraper pulling sector fund-flow and stock-leaderboard data from
    Eastmoney endpoints and handing it to a Processor.

    NOTE(review): Python 2 only (print statements).
    """

    def __init__(self):
        self.pres = Parser()
        self.proc = Processor()

    def start(self):
        """
        Start the crawler: fetch sector flows, then the leaderboard.
        :param urls: seed URLs
        :return: number of URLs crawled
        """

        self.getHy()
        self.getStockLHRank(2)

    # Fetch the most-bought stocks ("dragon-tiger" rank) from Eastmoney; hourly data.
    def getStockLHRank(self, timePeriod):
        root_urls = 'https://simqry2.eastmoney.com/qry_tzzh_v2?type=spo_rank_tiger&plat=2&ver=web20&rankType=30001&timePeriod=' + str(
            timePeriod) + '&recIdx=1&recCnt=50'
        html = download(root_urls, 'utf8')
        print html
        data = self.pres.StockLHRank(html)
        self.proc.StockLHRank(data)
        #print data
        #print data[0]

    #https://simqry2.eastmoney.com/qry_tzzh_v2?type=spo_rank_tiger&plat=2&ver=web20&rankType=30001&timePeriod=3&recIdx=1&recCnt=50

    # Fetch sector capital-inflow data (pages 1-2).
    def getHy(self):
        for p in range(1, 3):
            root_urls = 'http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cmd=C._BKHY&type=ct&st=BalFlowMain&sr=-1&p=' + str(
                p
            ) + '&token=894050c76af8597a853f5b408b759f5d&sty=DCFFITABK&rt=51115543'
            html = download(root_urls, 'utf8')
            '''
            html =re.findall(r'[(](.*?)[)]', html) 
            jsonObj  = json.loads(html[0])
            print   jsonObj[0].split(',')[14]
            '''
            data = self.pres.easey(html)
            self.proc.hyDb(data)
Exemple #24
0
def worker(address, host):
    """Run every enabled monitoring module against *host* in its own thread
    and append the parser's collected output to the global ``results`` list.

    NOTE(review): relies on module-level globals (config, MODULES_PATH, tick,
    results, import_module, Thread) — confirm against the surrounding file.
    """
    parser = Parser.new()
    parser.global_set_address(address)
    parser.global_set_host(host)
    # Conditional expression used as a statement: timestamp is only set when
    # reporting.timestamp == "true"; the "" branch is a deliberate no-op.
    parser.global_set_timestamp(
    ) if config["reporting"]["timestamp"] == "true" else ""
    parser.global_pair("host", host["name"])

    # Identify all modules
    modules = [
        os.path.splitext(f)[0] for f in os.listdir(MODULES_PATH)
        if os.path.splitext(f)[1] == ".py" and f != "__init__.py"
    ]

    # Worker function to run module
    module_threads = []

    def module_worker(module):
        # Import and run the named module, sharing the per-host parser.
        mod = import_module("modules." + module)
        mod.run(parser)

    for module in modules:

        # Check if module is activated in host settings
        if module in host["modules"]:
            # Use host-specific settings
            setting_enabled = host["modules"][module][0]
            setting_tick = host["modules"][module][1]
        else:
            # Use global settings
            try:
                setting_enabled = config["global"]["modules"][module][0]
                setting_tick = config["global"]["modules"][module][1]
            except:
                continue  # TODO , add warning? NOTE(review): bare except hides config errors

        # Check if module is active in global and in host settings
        if setting_enabled == "false" or ((tick % setting_tick) != 0):
            continue

        # Start thread
        thread = Thread(target=module_worker, args=(module, ))
        thread.start()
        module_threads.append(thread)

    # Wait for all module threads to complete
    for thread in module_threads:
        thread.join()

    results.append(parser.get())
    def test_parse(self):
        """parse returns dict/list/None normally, a 2-tuple in pair mode, and
        an empty dict exactly when no parser is configured for the extension.

        NOTE(review): Python 2 era style (backslash continuations).
        """
        for f in self.files:
            p = Parser()
            res = p.parse(f, None, False)

            self.assertTrue(
                isinstance(res, dict) or \
                isinstance(res, list) or \
                res is None)

            res = p.parse(f, None, True)

            self.assertTrue(isinstance(res, tuple))
            self.assertTrue(len(res) == 2)

            res = p.parse(f, None, False)

            def_parser = PARSERS_CONFIG.get(p.ext, None)

            cond = def_parser != None and res != {} or\
                   def_parser == None and res == {}

            self.assertTrue(cond)
Exemple #26
0
    def __init__(self):
        """Build the main window, then wire up settings, logging, parsing and
        processing components in dependency order."""
        super().__init__(parent=None,
                         title='ChoseFile',
                         size=(640, 535),
                         style=wx.SYSTEM_MENU | wx.CAPTION | wx.CLOSE_BOX
                         | wx.CLIP_CHILDREN)

        # ui
        self.init_ui()

        # config
        self.setting = Setting()

        # default values derived from the loaded settings
        self.init_default(self.setting)

        # logger writes into the console text widget created by init_ui
        self.logger = Logger(self.console_text)

        # parser
        self.parser = Parser(self.logger)

        # processor
        self.processor = Processor(self.logger, self.setting)
Exemple #27
0
def run():
    """Start spider, parser and checker threads sharing one redis connection."""
    # Add the project directory to the import path
    sys.path.append(settings.BASE_DIR)
    # Connect to redis and instantiate the spider, parser and checker
    redis_cli = redis.Redis(**settings.REDIS_PARAM)
    spider = Spider()
    parser = Parser(spider.queue, redis_cli)
    checher = Checker(redis_cli)
    # Create and start the worker threads
    thread_list = [
        Thread(target=spider.run),
        Thread(target=parser.run),
        Thread(target=checher.run)
    ]
    for thread in thread_list:
        thread.start()
Exemple #28
0
 def run(self):
     """Interactive loop: parse infix expressions until blank input.

     NOTE(review): Python 2 only (``raw_input``, print statement, old
     ``except Exception, e`` syntax).
     """
     parser = Parser()
     while True:
         sourceStr = raw_input("Enter an infix expression: ")
         if sourceStr == "": break
         try:
             parser.parse(sourceStr)
             print parser.parseStatus()
         except Exception, e:
             print "Error:"
             print e
Exemple #29
0
def generate_help_epilogue():
    """Build the argparse epilog text listing every available parser and writer."""
    margin = '   '

    parts = ['Parsers:\n', '[use them as --input-format parameter]\n', '\n']
    parts.extend(cls.get_help(margin) for cls in Parser.get_all_subclasses())

    parts.extend(['\nWriters:\n', '[use them as --output-format parameter]\n', '\n'])
    parts.extend(cls.get_help(margin) for cls in Writer.get_all_subclasses())

    return ''.join(parts)
Exemple #30
0
def generate_help_epilogue():
    """Build the argparse epilog text listing every available parser and writer."""
    margin = '   '

    # Parser help must be assembled before writer help to keep section order.
    parser_help = ''.join(cls.get_help(margin) for cls in Parser.get_all_subclasses())
    writer_help = ''.join(cls.get_help(margin) for cls in Writer.get_all_subclasses())

    return (
        'Parsers:\n'
        '[use them as --input-format parameter]\n'
        '\n'
        + parser_help +
        '\nWriters:\n'
        '[use them as --output-format parameter]\n'
        '\n'
        + writer_help
    )
Exemple #31
0
def parse_doc_section(section, dom):
    """Feed every non-empty paragraph of *section* to Parser(dom), reporting
    handled/unhandled counts on stderr."""
    parser = Parser(dom)

    unhandled = 0
    handled = 0
    for para in section["paragraphs"]:
        para['text'] = _para_text_content(para)
        if not para['text']:
            continue
        ok = parser(para)
        if ok:
            handled += 1
        elif para['text']:
            unhandled += 1
            print('unhandled para {}:'.format(para['index']),
                  para,
                  '\n',
                  file=sys.stderr)
    print('handled paras: {}'.format(handled), file=sys.stderr)
    print('unhandled paras: {}'.format(unhandled), file=sys.stderr)
Exemple #32
0
def parse_doc_section(section, dom):
    """Parse each paragraph of *section* with Parser(dom), reporting handled
    and unhandled counts on stderr.

    Each paragraph is given a lazy ``para['next']`` callback so matchers can
    look ahead to the following non-empty paragraph without mutating the
    original list.
    """
    def prep_para(para):
        # Attach the plain text plus a 'next' closure for lookahead.
        para['text'] = _para_text_content(para)

        def next_para():
            # Resolve this paragraph's successor lazily; deep-copied so the
            # lookahead's own prep doesn't leak into the real paragraph list.
            paras = section['paragraphs']
            next_index = para['index'] - paras[0]['index'] + 1
            if next_index >= len(paras):
                return None
            next_p = prep_para(copy.deepcopy(paras[next_index]))
            if matchers.empty(next_p):
                # Skip over empty paragraphs recursively.
                next_p = next_p['next']()
            return next_p

        para['next'] = next_para
        return para

    parser = Parser(dom)

    unhandled_count = 0
    handled_count = 0
    for para in section["paragraphs"]:
        prep_para(para)
        if not para['text']:
            continue
        success = parser(para)
        if not success and para['text']:
            unhandled_count += 1
            print('unhandled para {}:'.format(para['index']),
                  para,
                  '\n',
                  file=sys.stderr)
        elif success:
            handled_count += 1
    print('handled paras: {}'.format(handled_count), file=sys.stderr)
    print('unhandled paras: {}'.format(unhandled_count), file=sys.stderr)
Exemple #33
0
def convert(input, output=None, options=None):
    """Converts a supported ``input_format`` (*oldhepdata*, *yaml*)
    to a supported ``output_format`` (*csv*, *root*, *yaml*, *yoda*).

    :param input: location of input file for *oldhepdata* format or input directory for *yaml* format
    :param output: location of output directory to which converted files will be written
    :param options: additional options such as ``input_format`` and ``output_format`` used for conversion
    :type input: str
    :type output: str
    :type options: dict
    :raise ValueError: raised if no ``input_format`` or ``output_format`` is specified
    """
    # Fix: mutable default argument ({}). Use a None sentinel instead; behavior
    # is unchanged since the dict was only ever read, never mutated.
    if options is None:
        options = {}

    if 'input_format' not in options and 'output_format' not in options:
        raise ValueError("no input_format and output_format specified!")

    input_format = options.get('input_format', 'yaml')
    output_format = options.get('output_format', 'yaml')

    # Instantiate the concrete parser/writer selected by name.
    parser = Parser.get_concrete_class(input_format)(**options)
    writer = Writer.get_concrete_class(output_format)(**options)

    if not output and not writer.single_file_output:
        raise ValueError(
            "this output_format requires specifying 'output' argument")

    # if no output was specified create proxy output to which writer can insert data
    _output = output
    if not _output:
        _output = StringIO.StringIO()

    writer.write(parser.parse(input), _output)

    # if no output was specified return output
    if not output:
        return _output.getvalue()
 def __init__(self, filename):
     # Delegate to Parser with the fixed ".date.csv" suffix for this file type.
     Parser.__init__(self, filename, ".date.csv")
    def test_trace(self):
        """The parsed fixture must contain a 'trace' entry of exactly 268 items."""
        p = Parser()
        res = p.parse(self.f, None, False)

        self.assertTrue(res['trace'] is not None)
        self.assertTrue(len(res['trace']) == 268)
Exemple #36
0
log.info('Application logger initialized')


# Cache registration with Flask app
cache.init_app(app)
log.info('Cache initialized')


# Database registration with Flask app
db.app = app
db.init_app(app)
log.info('DB initialized')


# Parser initialization
parser = Parser()
counter = 0
log.info('Parser initialized')


# DB structure initialization and parser first run
with app.app_context():
    db.create_all()
    # Fetch rates on first start of the app. Idiomatic emptiness test
    # (`not ...`) instead of calling __len__ directly.
    if not Rate.query.all():
        parser.get_rates()


# Scheduled job
def run_schedule():
    global counter
 def __init__(self, filename):
     # Delegate to Parser with the fixed ".withdrawal.csv" suffix for this file type.
     Parser.__init__(self, filename, ".withdrawal.csv")
Exemple #38
0
 def __init__(self, w_m, profile):
     # Wire up the parser, user profile, working memory and knowledge base.
     self.parser = Parser()
     self.profile = profile
     self.w_m = w_m
     self.k_b = KnowledgeBase()
Exemple #39
0
'''main.py main method.'''
from parsers import Parser

if __name__ == '__main__':
    # Build the DFA and gather the named test inputs from the lab files.
    parser = Parser('fallbackdfa.in', 'Lab2.in')
    dfa = parser.parse_dfa()

    results = []
    for input_name, input_data in parser.parse_test_inputs():
        print('--', input_name)
        run_output = dfa.run(input_data)
        results.append((input_name.replace('in', 'out'), run_output))

    # Write one section per test case: its output name, then either the
    # recognized (category, lexeme) pairs or an error line.
    with open('Lab2-2.out', 'w') as out_file:
        for out_name, run_output in results:
            pairs = ['%s,%s' % (state.lexical_category_name, lexeme)
                     for state, lexeme in run_output]
            if pairs:
                section = '%s\n%s' % (out_name, ' '.join(pairs))
            else:
                section = out_name + "\nError: Input lexemes don't match the language!"
            out_file.write(section)
            out_file.write('\n')
Exemple #40
0
    def run(
        self
    ):  # it is necessary to get the qprocess because we need to send it back to the scheduler when we're done importing
        """Import an nmap XML file into the database on a worker thread.

        Parses ``self.filename``, then — while holding the DB write
        semaphore — creates missing host, OS, service, port and script
        rows and updates existing ones, emitting ``tick`` progress
        signals and ``done``/``schedule`` on completion.
        Python 2 code (print statements).
        """
        try:
            print "[+] Parsing nmap xml file: " + self.filename
            starttime = time.time()

            try:
                parser = Parser(self.filename)
            # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt.
            except:
                print '\t[-] Giving up on import due to previous errors.'
                print "\t[-] Unexpected error:", sys.exc_info()[0]
                self.done.emit()
                return

            self.db.dbsemaphore.acquire(
            )  # ensure that while this thread is running, no one else can write to the DB
            s = parser.get_session()  # nmap session info
            if s:
                nmap_session(self.filename, s.start_time, s.finish_time,
                             s.nmap_version, s.scan_args, s.total_hosts,
                             s.up_hosts, s.down_hosts)
            hostCount = len(parser.all_hosts())
            if hostCount == 0:  # to fix a division by zero if we ran nmap on one host
                hostCount = 1
            progress = 100.0 / hostCount
            totalprogress = 0
            self.tick.emit(int(totalprogress))

            for h in parser.all_hosts(
            ):  # create all the hosts that need to be created
                db_host = nmap_host.query.filter_by(ip=h.ip).first()

                if not db_host:  # if host doesn't exist in DB, create it first
                    hid = nmap_host('', '', h.ip, h.ipv4, h.ipv6, h.macaddr,
                                    h.status, h.hostname, h.vendor, h.uptime,
                                    h.lastboot, h.distance, h.state, h.count)
                    note(hid, '')

            session.commit()

            for h in parser.all_hosts(
            ):  # create all OS, service and port objects that need to be created

                db_host = nmap_host.query.filter_by(
                    ip=h.ip).first()  # fetch the host

                os_nodes = h.get_OS()  # parse and store all the OS nodes
                for os in os_nodes:
                    db_os = nmap_os.query.filter_by(
                        host_id=db_host.id).filter_by(name=os.name).filter_by(
                            family=os.family).filter_by(
                                generation=os.generation).filter_by(
                                    os_type=os.os_type).filter_by(
                                        vendor=os.vendor).first()

                    if not db_os:
                        nmap_os(os.name, os.family, os.generation, os.os_type,
                                os.vendor, os.accuracy, db_host)

                for p in h.all_ports():  # parse the ports
                    s = p.get_service()

                    if not (
                            s is None
                    ):  # check if service already exists to avoid adding duplicates
                        db_service = nmap_service.query.filter_by(
                            name=s.name).filter_by(
                                product=s.product).filter_by(
                                    version=s.version).filter_by(
                                        extrainfo=s.extrainfo).filter_by(
                                            fingerprint=s.fingerprint).first()

                        if not db_service:
                            db_service = nmap_service(s.name, s.product,
                                                      s.version, s.extrainfo,
                                                      s.fingerprint)

                    else:  # else, there is no service info to parse
                        db_service = None
                        # fetch the port
                    db_port = nmap_port.query.filter_by(
                        host_id=db_host.id).filter_by(
                            port_id=p.portId).filter_by(
                                protocol=p.protocol).first()

                    if not db_port:
                        db_port = nmap_port(p.portId, p.protocol, p.state,
                                            db_host, db_service)

            session.commit()

            totalprogress += progress
            self.tick.emit(int(totalprogress))

            for h in parser.all_hosts(
            ):  # create all script objects that need to be created

                db_host = nmap_host.query.filter_by(ip=h.ip).first()

                for p in h.all_ports():
                    for scr in p.get_scripts():

                        db_port = nmap_port.query.filter_by(
                            host_id=db_host.id).filter_by(
                                port_id=p.portId).filter_by(
                                    protocol=p.protocol).first()
                        db_script = nmap_script.query.filter_by(
                            script_id=scr.scriptId).filter_by(
                                port_id=db_port.id).first()

                        if not db_script:  # if this script object doesn't exist, create it
                            nmap_script(scr.scriptId, scr.output, db_port,
                                        db_host)

                for hs in h.get_hostscripts():
                    db_script = nmap_script.query.filter_by(
                        script_id=hs.scriptId).filter_by(
                            host_id=db_host.id).first()
                    if not db_script:
                        nmap_script(hs.scriptId, hs.output, None, db_host)

            session.commit()

            for h in parser.all_hosts():  # update everything

                db_host = nmap_host.query.filter_by(ip=h.ip).first(
                )  # get host from DB (if any with the same IP address)

                # Fill in empty host fields from the freshly parsed data.
                if db_host.ipv4 == '' and not h.ipv4 == '':
                    db_host.ipv4 = h.ipv4
                if db_host.ipv6 == '' and not h.ipv6 == '':
                    db_host.ipv6 = h.ipv6
                if db_host.macaddr == '' and not h.macaddr == '':
                    db_host.macaddr = h.macaddr
                if not h.status == '':
                    db_host.status = h.status
                if db_host.hostname == '' and not h.hostname == '':
                    db_host.hostname = h.hostname
                if db_host.vendor == '' and not h.vendor == '':
                    db_host.vendor = h.vendor
                if db_host.uptime == '' and not h.uptime == '':
                    db_host.uptime = h.uptime
                if db_host.lastboot == '' and not h.lastboot == '':
                    db_host.lastboot = h.lastboot
                if db_host.distance == '' and not h.distance == '':
                    db_host.distance = h.distance
                if db_host.state == '' and not h.state == '':
                    db_host.state = h.state
                if db_host.count == '' and not h.count == '':
                    db_host.count = h.count

                tmp_name = ''
                tmp_accuracy = '0'  # TODO: check if better to convert to int for comparison

                os_nodes = h.get_OS()
                for os in os_nodes:
                    db_os = nmap_os.query.filter_by(
                        host_id=db_host.id).filter_by(name=os.name).filter_by(
                            family=os.family).filter_by(
                                generation=os.generation).filter_by(
                                    os_type=os.os_type).filter_by(
                                        vendor=os.vendor).first()

                    db_os.os_accuracy = os.accuracy  # update the accuracy

                    if not os.name == '':  # get the most accurate OS match/accuracy to store it in the host table for easier access
                        # NOTE(review): tmp_accuracy is a string, so this is a
                        # lexicographic compare ('9' > '10') — confirm intent.
                        if os.accuracy > tmp_accuracy:
                            tmp_name = os.name
                            tmp_accuracy = os.accuracy

                if os_nodes:  # if there was operating system info to parse

                    if not tmp_name == '' and not tmp_accuracy == '0':  # update the current host with the most accurate OS match
                        db_host.os_match = tmp_name
                        db_host.os_accuracy = tmp_accuracy

                for p in h.all_ports():
                    s = p.get_service()
                    if not (s is None):
                        # fetch the service for this port
                        db_service = nmap_service.query.filter_by(
                            name=s.name).filter_by(
                                product=s.product).filter_by(
                                    version=s.version).filter_by(
                                        extrainfo=s.extrainfo).filter_by(
                                            fingerprint=s.fingerprint).first()
                    else:
                        db_service = None
                        # fetch the port
                    db_port = nmap_port.query.filter_by(
                        host_id=db_host.id).filter_by(
                            port_id=p.portId).filter_by(
                                protocol=p.protocol).first()
                    db_port.state = p.state

                    if not (
                            db_service is None
                    ):  # if there is some new service information, update it
                        db_port.service_id = db_service.id

                    for scr in p.get_scripts(
                    ):  # store the script results (note that existing script outputs are also kept)
                        db_script = nmap_script.query.filter_by(
                            script_id=scr.scriptId).filter_by(
                                port_id=db_port.id).first()

                        # NOTE(review): db_script may be None if the script row
                        # was never created earlier — confirm this cannot happen.
                        if not scr.output == '':
                            db_script.output = scr.output

                totalprogress += progress
                self.tick.emit(int(totalprogress))

            session.commit()
            self.db.dbsemaphore.release()  # we are done with the DB
            print '\t[+] Finished in ' + str(time.time() -
                                             starttime) + ' seconds.'
            self.done.emit()
            self.schedule.emit(
                parser, self.output == ''
            )  # call the scheduler (if there is no terminal output it means we imported nmap)

        # NOTE(review): bare except hides the real traceback from the caller.
        except:
            print '\t[-] Something went wrong when parsing the nmap file..'
            print "\t[-] Unexpected error:", sys.exc_info()[0]
            self.done.emit()
Exemple #41
0
 def __init__(self, filename):
     """Initialize a deposit-CSV parser via the base Parser.

     Passes the fixed ``.deposit.csv`` suffix identifying this
     export file variant.
     """
     suffix = ".deposit.csv"
     Parser.__init__(self, filename, suffix)
Exemple #42
0
 def __init__(self, filename):
     """Initialize a balance-CSV parser via the base Parser.

     Passes the fixed ``.balance.csv`` suffix identifying this
     export file variant.
     """
     suffix = ".balance.csv"
     Parser.__init__(self, filename, suffix)
Exemple #43
0
    def run(
        self
    ):  # it is necessary to get the qprocess because we need to send it back to the scheduler when we're done importing
        """Import an nmap XML file into the DB on a worker thread (Qt).

        Creates/updates host, OS, service, port, script and CVE rows via
        a SQLAlchemy session while holding the DB write semaphore,
        drives ``importProgressWidget``, and finally emits ``done`` and
        ``schedule``.
        """
        try:
            self.importProgressWidget.show()
            session = self.db.session()
            self.tsLog("Parsing nmap xml file: " + self.filename)
            starttime = time()

            try:
                parser = Parser(self.filename)
            # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt.
            except:
                self.tsLog('Giving up on import due to previous errors.')
                self.tsLog("Unexpected error: {0}".format(sys.exc_info()[0]))
                self.done.emit()
                return

            self.db.dbsemaphore.acquire(
            )  # ensure that while this thread is running, no one else can write to the DB
            s = parser.get_session()  # nmap session info
            if s:
                n = nmap_session(self.filename, s.start_time, s.finish_time,
                                 s.nmap_version, s.scan_args, s.total_hosts,
                                 s.up_hosts, s.down_hosts)
                session.add(n)
            hostCount = len(parser.all_hosts())
            if hostCount == 0:  # to fix a division by zero if we ran nmap on one host
                hostCount = 1
            totalprogress = 0

            self.importProgressWidget.setProgress(int(totalprogress))
            self.importProgressWidget.show()

            # Per-phase progress accumulators; each phase contributes
            # (100.0 / hostCount) / 5 per processed item.
            createProgress = 0
            createOsNodesProgress = 0
            createPortsProgress = 0

            for h in parser.all_hosts(
            ):  # create all the hosts that need to be created
                db_host = session.query(nmap_host).filter_by(ip=h.ip).first()

                if not db_host:  # if host doesn't exist in DB, create it first
                    hid = nmap_host(os_match='',
                                    os_accuracy='',
                                    ip=h.ip,
                                    ipv4=h.ipv4,
                                    ipv6=h.ipv6,
                                    macaddr=h.macaddr,
                                    status=h.status,
                                    hostname=h.hostname,
                                    vendor=h.vendor,
                                    uptime=h.uptime,
                                    lastboot=h.lastboot,
                                    distance=h.distance,
                                    state=h.state,
                                    count=h.count)
                    self.tsLog("Adding db_host")
                    session.add(hid)
                    t_note = note(h.ip, 'Added by nmap')
                    session.add(t_note)
                else:
                    self.tsLog("Found db_host already in db")

                createProgress = createProgress + ((100.0 / hostCount) / 5)
                totalprogress = totalprogress + createProgress
                self.importProgressWidget.setProgress(int(totalprogress))
                self.importProgressWidget.show()

            session.commit()

            for h in parser.all_hosts(
            ):  # create all OS, service and port objects that need to be created
                self.tsLog("Processing h {ip}".format(ip=h.ip))

                db_host = session.query(nmap_host).filter_by(ip=h.ip).first()
                if db_host:
                    self.tsLog(
                        "Found db_host during os/ports/service processing")
                else:
                    # NOTE(review): self.log — the rest of this method uses
                    # self.tsLog; confirm this attribute exists.
                    self.log(
                        "Did not find db_host during os/ports/service processing"
                    )

                os_nodes = h.get_OS()  # parse and store all the OS nodes
                self.tsLog("    'os_nodes' to process: {os_nodes}".format(
                    os_nodes=str(len(os_nodes))))
                for os in os_nodes:
                    self.tsLog(
                        "    Processing os obj {os}".format(os=str(os.name)))
                    db_os = session.query(nmap_os).filter_by(
                        host_id=db_host.id).filter_by(name=os.name).filter_by(
                            family=os.family).filter_by(
                                generation=os.generation).filter_by(
                                    os_type=os.os_type).filter_by(
                                        vendor=os.vendor).first()

                    if not db_os:
                        t_nmap_os = nmap_os(os.name, os.family, os.generation,
                                            os.os_type, os.vendor, os.accuracy,
                                            db_host.id)
                        session.add(t_nmap_os)

                    createOsNodesProgress = createOsNodesProgress + (
                        (100.0 / hostCount) / 5)
                    totalprogress = totalprogress + createOsNodesProgress
                    self.importProgressWidget.setProgress(int(totalprogress))
                    self.importProgressWidget.show()

                session.commit()

                all_ports = h.all_ports()
                self.tsLog("    'ports' to process: {all_ports}".format(
                    all_ports=str(len(all_ports))))
                for p in all_ports:  # parse the ports
                    self.tsLog("        Processing port obj {port}".format(
                        port=str(p.portId)))
                    s = p.get_service()

                    if not (
                            s is None
                    ):  # check if service already exists to avoid adding duplicates
                        #print("            Found service {service} for port {port}".format(service=str(s.name),port=str(p.portId)))
                        #db_service = session.query(nmap_service).filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
                        # NOTE(review): lookup is by name only (unlike the
                        # commented-out full match above) — confirm intended.
                        db_service = session.query(nmap_service).filter_by(
                            name=s.name).first()
                        if not db_service:
                            #print("Did not find service *********** name={0} prod={1} ver={2} extra={3} fing={4}".format(s.name, s.product, s.version, s.extrainfo, s.fingerprint))
                            db_service = nmap_service(s.name, s.product,
                                                      s.version, s.extrainfo,
                                                      s.fingerprint)
                            session.add(db_service)
                    # else:
                    #print("FOUND service *************** name={0}".format(db_service.name))

                    else:  # else, there is no service info to parse
                        db_service = None
                        # fetch the port
                    db_port = session.query(nmap_port).filter_by(
                        host_id=db_host.id).filter_by(
                            port_id=p.portId).filter_by(
                                protocol=p.protocol).first()

                    if not db_port:
                        #print("Did not find port *********** portid={0} proto={1}".format(p.portId, p.protocol))
                        if db_service:
                            db_port = nmap_port(p.portId, p.protocol, p.state,
                                                db_host.id, db_service.id)
                        else:
                            db_port = nmap_port(p.portId, p.protocol, p.state,
                                                db_host.id, '')
                        session.add(db_port)
                    #else:
                    #print('FOUND port *************** portid={0}'.format(db_port.port_id))
                    createPortsProgress = createPortsProgress + (
                        (100.0 / hostCount) / 5)
                    totalprogress = totalprogress + createPortsProgress
                    # NOTE(review): unlike the other calls, this passes a float
                    # (no int()) — confirm setProgress accepts it.
                    self.importProgressWidget.setProgress(totalprogress)
                    self.importProgressWidget.show()

            session.commit()

            #totalprogress += progress
            #self.tick.emit(int(totalprogress))

            for h in parser.all_hosts(
            ):  # create all script objects that need to be created

                db_host = session.query(nmap_host).filter_by(ip=h.ip).first()

                for p in h.all_ports():
                    for scr in p.get_scripts():
                        self.tsLog(
                            "        Processing script obj {scr}".format(
                                scr=str(scr)))
                        db_port = session.query(nmap_port).filter_by(
                            host_id=db_host.id).filter_by(
                                port_id=p.portId).filter_by(
                                    protocol=p.protocol).first()
                        db_script = session.query(nmap_script).filter_by(
                            script_id=scr.scriptId).filter_by(
                                port_id=db_port.id).first()
                        cveResults = scr.get_cves()
                        for cveEntry in cveResults:
                            t_cve = cve(name=cveEntry.name,
                                        url=cveEntry.url,
                                        source=cveEntry.source,
                                        severity=cveEntry.severity,
                                        product=cveEntry.product,
                                        version=cveEntry.version,
                                        hostId=db_host.id)
                            session.add(t_cve)

                        if not db_script:  # if this script object doesn't exist, create it
                            t_nmap_script = nmap_script(
                                scr.scriptId, scr.output, db_port.id,
                                db_host.id)
                            self.tsLog(
                                "        Adding nmap_script obj {script}".
                                format(script=scr.scriptId))
                            session.add(t_nmap_script)

                for hs in h.get_hostscripts():
                    db_script = session.query(nmap_script).filter_by(
                        script_id=hs.scriptId).filter_by(
                            host_id=db_host.id).first()
                    if not db_script:
                        t_nmap_script = nmap_script(hs.scriptId, hs.output,
                                                    None, db_host.id)
                        session.add(t_nmap_script)

            session.commit()

            for h in parser.all_hosts():  # update everything

                db_host = session.query(nmap_host).filter_by(ip=h.ip).first()

                # Fill in empty host fields from the freshly parsed data.
                if db_host.ipv4 == '' and not h.ipv4 == '':
                    db_host.ipv4 = h.ipv4
                if db_host.ipv6 == '' and not h.ipv6 == '':
                    db_host.ipv6 = h.ipv6
                if db_host.macaddr == '' and not h.macaddr == '':
                    db_host.macaddr = h.macaddr
                if not h.status == '':
                    db_host.status = h.status
                if db_host.hostname == '' and not h.hostname == '':
                    db_host.hostname = h.hostname
                if db_host.vendor == '' and not h.vendor == '':
                    db_host.vendor = h.vendor
                if db_host.uptime == '' and not h.uptime == '':
                    db_host.uptime = h.uptime
                if db_host.lastboot == '' and not h.lastboot == '':
                    db_host.lastboot = h.lastboot
                if db_host.distance == '' and not h.distance == '':
                    db_host.distance = h.distance
                if db_host.state == '' and not h.state == '':
                    db_host.state = h.state
                if db_host.count == '' and not h.count == '':
                    db_host.count = h.count

                session.add(db_host)

                tmp_name = ''
                tmp_accuracy = '0'  # TODO: check if better to convert to int for comparison

                os_nodes = h.get_OS()
                for os in os_nodes:
                    db_os = session.query(nmap_os).filter_by(
                        host_id=db_host.id).filter_by(name=os.name).filter_by(
                            family=os.family).filter_by(
                                generation=os.generation).filter_by(
                                    os_type=os.os_type).filter_by(
                                        vendor=os.vendor).first()

                    db_os.os_accuracy = os.accuracy  # update the accuracy

                    if not os.name == '':  # get the most accurate OS match/accuracy to store it in the host table for easier access
                        # NOTE(review): tmp_accuracy is a string, so this is a
                        # lexicographic compare ('9' > '10') — confirm intent.
                        if os.accuracy > tmp_accuracy:
                            tmp_name = os.name
                            tmp_accuracy = os.accuracy

                if os_nodes:  # if there was operating system info to parse

                    if not tmp_name == '' and not tmp_accuracy == '0':  # update the current host with the most accurate OS match
                        db_host.os_match = tmp_name
                        db_host.os_accuracy = tmp_accuracy

                session.add(db_host)

                for p in h.all_ports():
                    s = p.get_service()
                    if not (s is None):
                        #db_service = session.query(nmap_service).filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
                        db_service = session.query(nmap_service).filter_by(
                            name=s.name).first()
                    else:
                        db_service = None
                        # fetch the port
                    db_port = session.query(nmap_port).filter_by(
                        host_id=db_host.id).filter_by(
                            port_id=p.portId).filter_by(
                                protocol=p.protocol).first()
                    if db_port:
                        #print("************************ Found {0}".format(db_port))

                        if db_port.state != p.state:
                            db_port.state = p.state
                            session.add(db_port)

                        if not (
                                db_service is None
                        ) and db_port.service_id != db_service.id:  # if there is some new service information, update it
                            db_port.service_id = db_service.id
                            session.add(db_port)

                    # NOTE(review): db_port may be None here (the `if db_port`
                    # guard above does not cover this loop) — confirm a port
                    # row always exists by this phase.
                    for scr in p.get_scripts(
                    ):  # store the script results (note that existing script outputs are also kept)
                        db_script = session.query(nmap_script).filter_by(
                            script_id=scr.scriptId).filter_by(
                                port_id=db_port.id).first()

                        if not scr.output == '' and scr.output is not None:
                            db_script.output = scr.output

                        session.add(db_script)

            totalprogress = 100
            self.importProgressWidget.setProgress(int(totalprogress))
            self.importProgressWidget.show()

            session.commit()
            self.db.dbsemaphore.release()  # we are done with the DB
            self.tsLog('Finished in ' + str(time() - starttime) + ' seconds.')
            self.done.emit()
            self.importProgressWidget.hide()
            self.schedule.emit(
                parser, self.output == ''
            )  # call the scheduler (if there is no terminal output it means we imported nmap)

        except Exception as e:
            self.tsLog('Something went wrong when parsing the nmap file..')
            self.tsLog("Unexpected error: {0}".format(sys.exc_info()[0]))
            self.tsLog(e)
            raise
            # NOTE(review): unreachable — the `raise` above exits first, so
            # `done` is never emitted on failure; confirm intent.
            self.done.emit()
 def __init__(self, filename):
     """Initialize a deposit-CSV parser via the base Parser.

     Passes the fixed ``.deposit.csv`` suffix identifying this
     export file variant.
     """
     suffix = ".deposit.csv"
     Parser.__init__(self, filename, suffix)
Exemple #45
0
def identifyCorpus(corpus):
    """Train an embedding oracle on *corpus*, parse it, and return evaluation scores."""
    # Refresh the corpus state before training.
    corpus.update()
    classifier = EmbeddingOracle.train(corpus)
    Parser.parse(corpus, classifier)
    return Evaluation.evaluate(corpus)
    def test_summary(self):
        """parse() should produce a non-None 'summary' entry of the expected length."""
        p = Parser()
        res = p.parse(self.f, None, False)

        # assertIsNotNone/assertEqual report the actual value on failure,
        # unlike assertTrue over a composed boolean expression.
        self.assertIsNotNone(res['summary'])
        self.assertEqual(len(res['summary']), 46)
Exemple #47
0
 def __init__(self, filename):
     """Construct a check parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".check.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".check.csv")
Exemple #48
0
    def test_index_timestamp_returns_timestamp(self):
        """index_timestamp() should pull the integer timestamp out of the row."""
        parser = Parser()
        markup = '<tr><td></td><td><u>123456</u></td></tr>'
        result = parser.index_timestamp(make_tree(markup))
        self.assertEqual(result, 123456)
Exemple #49
0
	def run(self):														# it is necessary to get the qprocess because we need to send it back to the scheduler when we're done importing
		"""Import an nmap XML file into the database.

		Three passes over the parsed hosts: (1) create missing host rows,
		(2) create missing OS/service/port rows, (3) create missing script
		rows; a final pass updates existing rows with any fresher data.
		Emits ``tick`` progress signals, ``done`` on completion (or error),
		and hands the parser to the scheduler via ``schedule``.

		NOTE(review): legacy Python 2 code (print statements, bare except).
		"""
		try:
			print "[+] Parsing nmap xml file: " + self.filename
			starttime = time.time()
			
			try:
				parser = Parser(self.filename)
			except:
				print '\t[-] Giving up on import due to previous errors.'
				print "\t[-] Unexpected error:", sys.exc_info()[0]
				self.done.emit()
				return
				
			self.db.dbsemaphore.acquire()								# ensure that while this thread is running, no one else can write to the DB
			s = parser.get_session()									# nmap session info
			if s:
				nmap_session(self.filename, s.start_time, s.finish_time, s.nmap_version, s.scan_args, s.total_hosts, s.up_hosts, s.down_hosts)
			hostCount = len(parser.all_hosts())
			if hostCount==0:											# to fix a division by zero if we ran nmap on one host
				hostCount=1
			progress = 100.0 / hostCount
			totalprogress = 0
			self.tick.emit(int(totalprogress))
	
			for h in parser.all_hosts():								# create all the hosts that need to be created
				db_host = nmap_host.query.filter_by(ip=h.ip).first()
				
				if not db_host:											# if host doesn't exist in DB, create it first
					hid = nmap_host('', '', h.ip, h.ipv4, h.ipv6, h.macaddr, h.status, h.hostname, h.vendor, h.uptime, h.lastboot, h.distance, h.state, h.count)
					note(hid, '')

			session.commit()
			
			for h in parser.all_hosts():								# create all OS, service and port objects that need to be created

				db_host = nmap_host.query.filter_by(ip=h.ip).first()	# fetch the host
				
				os_nodes = h.get_OS()									# parse and store all the OS nodes
				for os in os_nodes:
					db_os = nmap_os.query.filter_by(host_id=db_host.id).filter_by(name=os.name).filter_by(family=os.family).filter_by(generation=os.generation).filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
					
					if not db_os:
						nmap_os(os.name, os.family, os.generation, os.os_type, os.vendor, os.accuracy, db_host)

				for p in h.all_ports():									# parse the ports
					s = p.get_service()

					if not (s is None):									# check if service already exists to avoid adding duplicates
						db_service = nmap_service.query.filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
						
						if not db_service:
							db_service = nmap_service(s.name, s.product, s.version, s.extrainfo, s.fingerprint)

					else:												# else, there is no service info to parse
						db_service = None					
																		# fetch the port
					db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()
					
					if not db_port:		
						db_port = nmap_port(p.portId, p.protocol, p.state, db_host, db_service)

			session.commit()
			
			totalprogress += progress
			self.tick.emit(int(totalprogress))

			for h in parser.all_hosts():								# create all script objects that need to be created
				
				db_host = nmap_host.query.filter_by(ip=h.ip).first()
				
				for p in h.all_ports():
					for scr in p.get_scripts():
												
						db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()
						db_script = nmap_script.query.filter_by(script_id=scr.scriptId).filter_by(port_id=db_port.id).first()

						if not db_script:								# if this script object doesn't exist, create it
							nmap_script(scr.scriptId, scr.output, db_port, db_host)
					
				for hs in h.get_hostscripts():
					db_script = nmap_script.query.filter_by(script_id=hs.scriptId).filter_by(host_id=db_host.id).first()
					if not db_script:
						nmap_script(hs.scriptId, hs.output, None, db_host)					
					
			session.commit()
					
			for h in parser.all_hosts():								# update everything

				db_host = nmap_host.query.filter_by(ip=h.ip).first()	# get host from DB (if any with the same IP address)
				
				# only fill in fields that are empty in the DB, so existing data is never overwritten
				if db_host.ipv4 == '' and not h.ipv4 == '':
					db_host.ipv4 = h.ipv4
				if db_host.ipv6 == '' and not h.ipv6 == '':
					db_host.ipv6 = h.ipv6
				if db_host.macaddr == '' and not h.macaddr == '':
					db_host.macaddr = h.macaddr
				if not h.status == '':
					db_host.status = h.status
				if db_host.hostname == '' and not h.hostname == '':
					db_host.hostname = h.hostname
				if db_host.vendor == '' and not h.vendor == '':
					db_host.vendor = h.vendor
				if db_host.uptime == '' and not h.uptime == '':
					db_host.uptime = h.uptime
				if db_host.lastboot == '' and not h.lastboot == '':
					db_host.lastboot = h.lastboot
				if db_host.distance == '' and not h.distance == '':
					db_host.distance = h.distance
				if db_host.state == '' and not h.state == '':
					db_host.state = h.state
				if db_host.count == '' and not h.count == '':
					db_host.count = h.count
						
				tmp_name = ''
				tmp_accuracy = '0' 										# TODO: check if better to convert to int for comparison
				
				os_nodes = h.get_OS()
				for os in os_nodes:
					db_os = nmap_os.query.filter_by(host_id=db_host.id).filter_by(name=os.name).filter_by(family=os.family).filter_by(generation=os.generation).filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
					
					db_os.os_accuracy = os.accuracy						# update the accuracy
							
					if not os.name == '':								# get the most accurate OS match/accuracy to store it in the host table for easier access
						if os.accuracy > tmp_accuracy:					# NOTE(review): string comparison, see TODO above
							tmp_name = os.name
							tmp_accuracy = os.accuracy

				if os_nodes:											# if there was operating system info to parse
					
					if not tmp_name == '' and not tmp_accuracy == '0':	# update the current host with the most accurate OS match
						db_host.os_match = tmp_name
						db_host.os_accuracy = tmp_accuracy
								
				for p in h.all_ports():		
					s = p.get_service()
					if not (s is None):
																		# fetch the service for this port
						db_service = nmap_service.query.filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
					else:
						db_service = None						
																		# fetch the port
					db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()					
					db_port.state = p.state
					
					if not (db_service is None):						# if there is some new service information, update it
						db_port.service_id = db_service.id
				
					for scr in p.get_scripts():							# store the script results (note that existing script outputs are also kept)	
						db_script = nmap_script.query.filter_by(script_id=scr.scriptId).filter_by(port_id=db_port.id).first()

						if not scr.output == '':
							db_script.output = scr.output
				
				totalprogress += progress
				self.tick.emit(int(totalprogress))		

			session.commit()
			self.db.dbsemaphore.release()								# we are done with the DB
			print '\t[+] Finished in '+ str(time.time()-starttime) + ' seconds.'
			self.done.emit()
			self.schedule.emit(parser, self.output == '')				# call the scheduler (if there is no terminal output it means we imported nmap)
			
		except:
			print '\t[-] Something went wrong when parsing the nmap file..'
			print "\t[-] Unexpected error:", sys.exc_info()[0]
			self.done.emit()
 def __init__(self, filename):
     """Construct a date parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".date.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".date.csv")
Exemple #51
0
    def test_index_nbytes_returns_nbytes(self):
        """index_nbytes() should pull the integer byte count out of the cell."""
        parser = Parser()
        markup = '<td class="tor-size"><u>123456</u></td>'
        result = parser.index_nbytes(make_tree(markup))
        self.assertEqual(result, 123456)
 def __init__(self, filename):
     """Construct a balance parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".balance.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".balance.csv")
Exemple #53
0
 def __init__(self, filename):
     """Construct a withdrawal parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".withdrawal.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".withdrawal.csv")
 def __init__(self, filename):
     """Construct a check parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".check.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".check.csv")
Exemple #55
0
def train(model, sess):
    """Run model training inside the session context and return the loss dict."""
    with sess:
        writers = model.init_summaries(sess)
        result = model.fit(sess, writers)
    return result


def test(model, sess):
    with sess:
        loss_dict = model.run_eval(sess, 'test')
    return loss_dict


if __name__ == '__main__':
    # Entry point: build config from CLI args, initialise the model/session,
    # then either evaluate a loaded model or train a fresh one.
    args = Parser().get_parser().parse_args()
    config = Config(args)
    model, sess = init_model(config)

    # Truthiness test instead of '== True' (PEP 8 / flake8 E712).
    if config.load:
        print("\033[92m=>\033[0m Testing Model")
        # NOTE(review): this branch announces "Testing" but calls train(),
        # and unpacks two values although train() returns a single dict.
        # It probably should call test(model, sess) — confirm before changing.
        test_loss, test_metrics = train(model, sess)
        output = "=> Test Loss : {}".format(test_loss)
    else:
        print("\033[92m=>\033[0m Training Model")
        loss_dict = train(model, sess)
        test_metrics = loss_dict['test_metrics']
        output = "=> Best Train Loss : {}, Test Loss : {}".format(
            loss_dict["train_loss"], loss_dict["test_loss"])

    # output += "\n=> Test : Coverage = {}, Average Precision = {}, Micro Precision = {}, Micro Recall = {}, Micro F Score = {}".format(metrics['coverage'], metrics['average_precision'], metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'])
    def test_main(self):
        """parse() should return the expected number of top-level entries."""
        p = Parser()
        res = p.parse(self.f, None, False)

        # assertEqual reports actual vs expected on failure, unlike assertTrue.
        self.assertEqual(len(res), 9)
 def __init__(self, filename):
     """Construct a transaction parser for *filename*.

     Delegates to ``Parser.__init__``; the ``".transaction.csv"`` argument is
     presumably an output-file suffix — TODO confirm against ``Parser``.
     """
     Parser.__init__(self, filename, ".transaction.csv")
Exemple #58
0
class Article(object):
	'''A fetched web page plus the metadata, text and links extracted from it.

	NOTE(review): legacy Python 2 code (u"" literals, print statements,
	``unicode``); extraction is delegated to project helper classes.
	'''
	def __init__(self, url, raw_html, step, lang="en"):
		# status is True until an extraction step records a failure dict
		self.status = True
		self.url = url
		self.step = step
		self.lang = lang
		
		# title of the article
		self.title = None	
		#text
		self.article = u""
		self.cleaned_text = u""
		# meta
		self.meta_description = u""
		self.meta_lang = u""
		self.meta_favicon = u""
		self.meta_keywords = u""
		#link and domain
		self.canonical_link = u""
		self.domain = u""
		# cleaned text
		self.top_node = None
		self.tags = set()
		self.final_url = url
		self.raw_html = raw_html
		# the lxml Document object
		self.parser = Parser()
		self.raw_doc = u""
		self.publish_date = None
		self.additional_data = {}
		self.links = []
		self.outlinks = []
		self.inlinks = []
		self.start_date = datetime.datetime.today()
	
	def get(self):
		'''Run the full extraction pipeline over raw_html.

		Populates title, meta fields, links, outlinks and content.
		Returns True on success; on failure sets self.status to an error
		dict (code -2) and returns False.
		'''
		try:
			self.doc = self.parser.fromstring(self.raw_html)
			#init extractor method
			extractor = StandardContentExtractor(self,"en")	
			# init the document cleaner
			cleaner = StandardDocumentCleaner(self)
			# init the output formatter
			formatter = StandardOutputFormatter(self, stopwords_class="en")
			#doc
			#self.doc = doc
			self.raw_doc = deepcopy(self.raw_html)
			
			self.title = extractor.get_title()
			#self.title = self.title
			#meta
			self.meta_lang = extractor.get_meta_lang()
			#self.meta_favicon = extractor.get_favicon()
			self.meta_description = extractor.get_meta_description()
			self.meta_description = self.meta_description.decode("utf-8")
			self.meta_keywords = extractor.get_meta_keywords()
			
			#domain and url
			self.canonical_link = extractor.get_canonical_link()
			self.domain = extractor.get_domain()
			#~ 
			#~ #tag
			self.tags = extractor.extract_tags()
			#~ #text
			self.doc = cleaner.clean()
			
			self.top_node = extractor.calculate_best_node()
			if self.top_node is not None:
				# post cleanup
				self.top_node = extractor.post_cleanup(self.top_node)
				
			# clean_text
			#self.cleaned_text = formatter.get_formatted_text()
			
			
			#self.content = self.content.decode("utf-8")
			self.links = extractor.get_links()
			self.outlinks = [{"url":url, "step": self.step+1} for url in extractor.get_outlinks()]
			# fall back to progressively cruder text extraction if formatting fails
			try:
				self.content = formatter.get_formatted_text()
				
			except Exception as e:
				try:
					self.content = bs(self.raw_html).getText()
					self.content = nltk.clean_html(self.content)
				except Exception as e:
					print e
					self.content  = re.sub(r'<.*?>', '', self.raw_html)
			#self.inlinks, self.inlinks_err = extractor.get_outlinks(self.links)
			# TODO
			# self.article.publish_date = self.extractor.get_pub_date(doc)
			# self.article.additional_data = self.extractor.more(doc)
			
			return True
			
		except Exception as e:
			
			self.status = {
				"url": self.url,
				"scope": "article extraction",
				"msg": e.args,
				"status": False,
				"code": -2
				}
			return False
				
	
	def repr(self):
		'''Replace self.status with a summary dict of the extracted article.

		NOTE(review): returns None; despite the name this is not __repr__.
		'''
		self.status ={
				"url": self.canonical_link,
				"domain": self.domain,
				"title": self.title.encode("utf-8"),
				"content": self.content,
				"description": self.meta_description.encode("utf-8"),
				"outlinks": self.outlinks,
				"crawl_date": self.start_date,
				"raw_html": self.raw_html,
				}
		return 
	
	def is_relevant(self, query):
		'''Match title/content against *query*; set status and return bool.'''
		self.content = {"title":unicode(self.title), "content": unicode(self.content)}
		if query.match(self.content) is False:
			self.status = {"url":self.url, "code": -1, "msg": "Not Relevant","status": False, "title": self.title, "content": self.content}
			return False
		else:
			self.repr()
			return True