Example #1
    def __init__(self, rulefname):
        """

        :param rulefname: path to the rule/configuration file
        """
        self.__rulefd = None
        try:
            self.__rulefd = codecs.open(rulefname,
                                        mode='rb',
                                        encoding='utf8',
                                        errors='ignore')
        except Exception as e:
            logging.error("read %s file failed: %s" % (rulefname, repr(e)))
            raise

        # parse the rule file (load() is presumably yaml.load, imported elsewhere in the module)
        dataMap = load(self.__rulefd)
        tcp_stream_handler = dataMap.get("tcp_stream_handler")
        self.bpf_filter = tcp_stream_handler.get("bpf_filter")
        self.dst_port_filter = tcp_stream_handler.get("dst_port_filter")
        self.dst_ip_filter = tcp_stream_handler.get("dst_ip_filter")
        self.pcap_file = tcp_stream_handler.get(
            "pcap_file") if tcp_stream_handler.get("pcap_file_enable",
                                                   0) == 1 else None

        if self.pcap_file:
            self.pcap_file = mills.path(self.pcap_file)

        self.device = tcp_stream_handler.get(
            "device") if tcp_stream_handler.get("device_enable",
                                                0) == 1 else None

        self.data_level = tcp_stream_handler.get("data_level", 1)
        self.data_stream_direct = tcp_stream_handler.get(
            "data_stream_direct", 2)
        self.std_output_enable = tcp_stream_handler.get("std_output_enable", 1)

        self.file_tcpsession_path = tcp_stream_handler.get(
            "file_tcpsession_path") if tcp_stream_handler.get(
                "file_output_enable", 0) == 1 else None

        if self.file_tcpsession_path:
            self.file_tcpsession_path = mills.path(self.file_tcpsession_path)

        self.protocol_parse_conf = tcp_stream_handler.get(
            "protocol_parse_conf")
Example #2
def parse_all(fnames=None, renew=False, proxy=None):
    """
    批量解析页面
    :param fnames:
    :param renew 是否重新解析所有文件
    :return:
    """
    so = SQLiteOper("data/scrap.db")
    if renew:
        fnames = []
        fname_gen = glob.iglob(r'data/secwiki/*.html')
        sql = 'delete from `secwiki_detail`'
        for f in fname_gen:
            fnames.append(f)

        so.execute(sql)

    if not fnames:
        print "no new secwiki"
        return

    nos = sort_fname(fnames)

    # sqlite handler
    sql = """insert into `secwiki_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`)
                            values(?,?,?,?,?,?,?);"""

    # file handler

    result_fname = path("data/secwiki_{start}_{end}.txt".format(
        start=nos.keys()[0], end=nos.keys()[-1]))

    if not renew and os.path.isfile(
            result_fname) and os.path.getsize(result_fname) > 0:
        return

    result_fh = codecs.open(result_fname, mode='wb')
    for k in nos.keys():
        fname = nos[k]

        with open(fname, mode='r') as html_hd:
            results_list = {}
            for content in parse_item(html_hd, so=so, proxy=proxy):
                if content:
                    # key each row by timestamp + url; use a separate name so the
                    # outer loop variable k is not shadowed
                    key = content[0] + content[2]

                    results_list[key] = content

                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            so.executemany(sql, operate_list=results_list.values())

    result_fh.close()
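
A typical invocation, assuming the data/ layout used above (the file name in the second call is a hypothetical placeholder):

# re-parse every cached weekly page and rebuild the secwiki_detail table
parse_all(renew=True)

# or parse only freshly downloaded pages
parse_all(fnames=["data/secwiki/200_week.html"])
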
Example #3
def scrap_item(cur_day=None):
    """

    :return:
    """
    year = cur_day[0:4]
    month = cur_day[4:6]
    day = cur_day[6:8]

    fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
        year=year, month=month, day=day))

    url = """https://xuanwulab.github.io/cn/secnews/{year}/{month}/{day}/index.html""".format(
        year=year, month=month, day=day)
    print url
    logging.info("[SCRAP_PAGE]: %s" % url)
    try:

        r = requests.get(url)
        if r.status_code == 200:
            fname_year = path("data/xuanwu/{year}".format(year=year))

            if not os.path.exists(fname_year):
                os.mkdir(fname_year)
            fname_month = path("data/xuanwu/{year}/{month}".format(
                year=year, month=month))

            if not os.path.exists(fname_month):
                os.mkdir(fname_month)

            fname_day = path("data/xuanwu/{year}/{month}/{day}".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname_day):
                os.mkdir(fname_day)

            with codecs.open(fname, mode='wb') as fw:
                fw.write(r.content)

                return fname

    except Exception as e:
        logging.error("[SCRAP_REQUEST_FAILED]: %s %s" % (url, str(e)))
Example #4
def scrap_item(i=1):
    """
    爬取单个页面
    :return:
    """
    url = "https://www.sec-wiki.com/weekly/{i}".format(i=i)
    if not os.path.exists(path("data/secwiki")):
        os.mkdir(path("data/secwiki"))
    fname = path("data/secwiki/{i}_week.html".format(i=i))
    logging.info("[SCRAP_PAGE]: %s" % url)
    try:

        r = requests.get(url)
        if r.status_code == 200:
            with codecs.open(fname, mode='wb') as fw:
                fw.write(r.content)
                return fname

    except Exception as e:
        logging.error("[SCRAP_REQUEST_FAILED]: %s %s" % (url, str(e)))
Example #5
def statistict_github_language(so, topn=132, reverse=True, year=''):
    """

    :param so:
    :return:
    """

    lang_dict = {}
    sql = "select distinct repo_lang from github where ts like '{year}%' and (repo_lang is not null or repo_lang != '')".format(
        year=year)
    # print sql
    result = so.query(sql)
    if result:
        for item in result:
            repo_lang = item[0]
            repo_langs = [_.strip() for _ in re.split(',', repo_lang)]
            for repo_lang in repo_langs:
                if not repo_lang:
                    continue
                if repo_lang in lang_dict:
                    lang_dict[repo_lang] = lang_dict[repo_lang] + 1
                else:
                    lang_dict[repo_lang] = 1

    vd = OrderedDict(
        sorted(lang_dict.items(), key=lambda t: t[1], reverse=reverse))
    sum_count = sum(vd.values())
    vd2 = OrderedDict()

    i = 0
    for k, v in vd.items():
        if i < topn:
            vd2[k] = round(float(v) / sum_count, 4)
        else:
            break
        i = i + 1
    fname = path("data", "%s_github_lang.txt" % year)
    with open(fname, mode='wb') as fw:
        for k, v in vd.items():
            fw.write("%s\t%s%s" % (k, v, os.linesep))

    return vd2
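
For illustration, a call with a small topn, reusing a SQLiteOper handle as in the other examples; the languages and fractions shown are hypothetical:

top_langs = statistict_github_language(so, topn=3, year='2018')
# e.g. OrderedDict([('Python', 0.4132), ('C', 0.2011), ('Go', 0.0857)])
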
Example #6
def draw_pie(so, source="secwiki", year="", tag="domain", top=10):
    """

    :return:
    """
    if tag != "language":

        ods = info_source(so,
                          table="{source}_detail".format(source=source),
                          top=top,
                          year=str(year),
                          tag=tag)
    else:
        ods = statistict_github_language(so, topn=top, year=year)

    labels = []
    values = []
    if not ods:
        return
    for k, v in ods.items():
        labels.append(k)
        values.append(v)

    labels.append("other")
    values.append(1 - sum(values))

    explode = [0.1 for _ in range(0, len(labels))]
    explode[-1] = 0  # keep the trailing "other" slice flush so the named slices stand out

    try:
        # plt.rcParams['font.sans-serif'] = ['MicrosoftYaHei']
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with this font
        plt.axes(aspect='equal')  # equal x/y scaling so the pie is drawn as a circle
        plt.pie(
            values,  # data to plot
            explode=explode,  # offset of each wedge from the center
            labels=labels,  # text label for each wedge
            labeldistance=1.2,  # distance of the wedge labels from the center
            pctdistance=0.6,  # distance of the percentage labels from the center
            startangle=90,  # starting angle of the first wedge
            shadow=True,  # draw a shadow under the pie
            autopct='%3.2f%%')

        if tag == "domain":
            title_pie = "%s-信息源占比-%s" % (year, source)
        elif tag == "tag":
            title_pie = "%s-信息类型占比-%s" % (year, source)
        elif tag == "language":

            title_pie = "%s-最喜欢语言占比" % (year)

        else:
            return

        plt.title(unicode(title_pie))

        fdir = path("data/img/%s" % tag)
        if not os.path.exists(fdir):
            os.mkdir(fdir)
        fpath = path(fdir, "%s.png" % title_pie)

        plt.legend(labels, loc='upper right', fontsize=5)

        plt.savefig(fpath)

        plt.close()
    except Exception as e:
        print source, year, tag
        print len(labels), labels
        print len(values), values
        print len(explode), explode
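
A hedged usage sketch, reusing the SQLiteOper handle from the other examples; the year is a hypothetical value:

so = SQLiteOper("data/scrap.db")
draw_pie(so, source="secwiki", year=2018, tag="domain", top=10)  # saves data/img/domain/<title>.png
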
Example #7
def parse_all(renew=False, ndays=None, proxy=None):
    """
    解析多个页面
    :return:
    """
    so = SQLiteOper("data/scrap.db")

    # parse the requested days, scraping any page that is missing locally
    fname_lists = []
    if ndays is not None:

        for cur_day in ndays:
            year = cur_day[0:4]
            month = cur_day[4:6]
            day = cur_day[6:8]
            fname = path("data/xuanwu/{year}/{month}/{day}/index.html".format(
                year=year, month=month, day=day))

            if not os.path.exists(fname):

                fname = scrap_item(cur_day)
                if fname is None:
                    print "%s news not exits" % cur_day

                else:
                    fname_lists.append(fname)

    if renew:
        fname_lists = []
        # re-parse every cached page
        sql = 'delete from `xuanwu_detail`'
        so.execute(sql)
        for fname in glob.iglob(r'data/xuanwu/*/*/*/index.html'):
            fname_lists.append(fname)

    if fname_lists:
        start, end = getstartendfrompath(fname_lists)
        sql = """
                    insert into `xuanwu_detail`(`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`path`,`author_id`)
                        values(?,?,?,?,?,?,?,?);
                    """
        # file handler
        result_fname = path("data/xuanwu_{start}_{end}.txt".format(start=start,
                                                                   end=end))

        if not renew and os.path.isfile(
                result_fname) and os.path.getsize(result_fname) > 0:
            return

        result_fh = codecs.open(result_fname, mode='wb')

        for fname in fname_lists:

            fname = path(fname)

            results_list = {}
            for content in parse_item(fname, so=so, proxy=proxy):
                if content:
                    k = content[0] + content[2]

                    results_list[k] = content
                    line = "\t".join(content)
                    print line
                    result_fh.write("{line}{linesep}".format(
                        line=line, linesep=os.linesep))

            if results_list:
                so.executemany(sql, operate_list=results_list.values())
Example #8
    else:
        cmd = cmd_darwin
    # run the platform-specific shell command and return its stripped stdout (the local IP)
    local_ip = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True)
    (IP, errors) = local_ip.communicate()
    local_ip.stdout.close()
    IP = IP.strip()
    return IP


if __name__ == "__main__":
    from optparse import OptionParser
    import logger

    logger.generate_special_logger(level=logging.INFO,
                                   logtype="network",
                                   curdir=mills.path("log/"),
                                   ismultiprocess=False)
    parser = OptionParser()

    parser.add_option(
        "--portHost",
        dest='getPortHostByteOrder',
        action="store",
        type="int",
        help=
        "convert a port from network byte order to host byte order (e.g. 20480 -> 80)",
        # default=20480
    )
    parser.add_option(
        "--portNetwork",
        dest='getPortNetworkByteOrder',