from dio_core.utils import file_util, json_util


def test_get_python_from_file():
    # Read the dumped rows and keep only the records for the given job.
    rows = file_util.readRows(
        "/home/changshuai/Temp/5dcdd0a7-8842-4695-8fd8-9553b191d346.tmp")
    data = []
    for row in rows:
        datum = json_util.to_python(row)
        if datum["jobName"] == "yili_ecomm_v2_20190605131709_003_27":
            data.append(datum)
    print(data)
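file_util.readRows and json_util.to_python are dio_core helpers. To run snippets like this one outside that codebase, a minimal stand-in (assuming readRows yields stripped, non-empty lines and to_python wraps json.loads) could be:

import json


def readRows(path):
    # Hypothetical stand-in: yield each non-empty line without its newline.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if line:
                yield line


def to_python(row):
    # Hypothetical stand-in: parse one JSON-encoded row into a Python object.
    return json.loads(row)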
Example #2
from dio_core.utils import file_util


def getAccordance(
        file_name1: str,
        file_name2: str,
        base_dir="/home/changshuai/PycharmProjects/dio_core/Test/Data/{}"):
    """
    获取准确率
    :param base_dir:
    :param file_name1: 浏览器数据
    :param file_name2: 爬虫数据
    :return:
    """
    file_name1 = file_name1 if file_name1.startswith("/") else base_dir.format(
        file_name1)
    file_name2 = file_name2 if file_name2.startswith("/") else base_dir.format(
        file_name2)
    row1 = set(file_util.readRows(file_name1))
    row2 = set(file_util.readRows(file_name2))

    print("交集: {}".format(len(row1 & row2)))
    print(len(row1 & row2) / len(row1))
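A usage sketch (these file names are hypothetical; relative names are resolved through base_dir, while absolute paths are used as-is):

getAccordance("browser_urls.txt", "crawler_urls.txt")
getAccordance("/tmp/browser_urls.txt", "/tmp/crawler_urls.txt")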
Example #3
from dio_core.utils import file_util


def compareUrls(
        file_name1: str,
        file_name2: str,
        base_dir="/home/changshuai/PycharmProjects/dio_core/Test/Data/{}"):
    """
    比较Url
    :param base_dir:
    :param file_name1: url集1
    :param file_name2: url集2
    :return:
    """
    file_name1 = file_name1 if file_name1.startswith("/") else base_dir.format(
        file_name1)
    file_name2 = file_name2 if file_name2.startswith("/") else base_dir.format(
        file_name2)

    row1 = set(file_util.readRows(file_name1))
    row2 = set(file_util.readRows(file_name2))

    print("交集1: {}".format(len(row1)))
    print("交集2: {}".format(len(row2)))
    print("交集: {}".format(len(row1 & row2)))

    print(len(row1 & row2) / len(row1 | row2))
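The final ratio is the Jaccard similarity of the two URL sets, |A ∩ B| / |A ∪ B|. A tiny worked example:

row1 = {"a", "b", "c"}
row2 = {"b", "c", "d"}

# The intersection {"b", "c"} has 2 elements and the union has 4,
# so the similarity is 2 / 4 = 0.5.
print(len(row1 & row2) / len(row1 | row2))  # 0.5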
Example #4
from dio_core.network.downloader import Downloader
from dio_core.utils import file_util, time_util

rows = file_util.readRows(
    "/home/changshuai/PycharmProjects/dio_core/Test/Data/kill_job_urls.txt")
# Kill each job listed in the file, pausing between requests.
for row in rows:
    url = "http://api.rhino.datatub.com/common/job/kill?job_id={}&token=5fa92f2a597cc60201780504be1028a7".format(
        row)
    res = Downloader.get(url)
    print(row, res.text, url)
    time_util.sleep(3)
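Downloader is dio_core's HTTP client. A minimal requests-based stand-in for the Downloader.get(url) calls in these snippets (the timeout value is an assumption) might be:

import requests


class Downloader:
    # Hypothetical stand-in for dio_core's Downloader, mirroring only
    # the Downloader.get(url) usage seen above.
    @staticmethod
    def get(url, timeout=10):
        return requests.get(url, timeout=timeout)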
Example #5
import time

from dio_core.network.downloader.downloader import Downloader
from dio_core.utils import file_util

rows = list(
    file_util.readRows(
        "/home/changshuai/PycharmProjects/dio_core/dio_core_test/Data/cli_kill_job_list.txt"
    ))
# rows = ["app_ecomm_all_20190802141514_042_83", "app_ecomm_all_20190802141516_952_91"]
host = "datatub5:20425"
# host = "api.rhino.datatub.com"
# Kill each job, tolerating individual request failures.
for row in rows:
    try:
        res = Downloader.get(
            "http://{}/common/job/kill?job_id={}&token=5fa92f2a597cc60201780504be1028a7"
            .format(host, row))
        print(res.json())

    except Exception as e:
        print(e)

    time.sleep(1)
Example #6
from dio_core.utils import file_util

rows = file_util.readRows("/home/changshuai/PycharmProjects/dio_core/dio_core_test/Data/cp_3539-changshuai_20190722.data")

lastWord = ""
curWord = ""

for ind, row in enumerate(rows):
    if ind % 2 == 0:
        lastWord = row
        continue
    else:
        curWord = row
        if curWord != lastWord:
            print(curWord, lastWord)
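The index bookkeeping can be avoided by pairing even- and odd-indexed rows directly; a sketch, assuming the rows fit in memory:

rows = list(rows)

# Pair each even-indexed row with the odd-indexed row that follows it.
for lastWord, curWord in zip(rows[::2], rows[1::2]):
    if curWord != lastWord:
        print(curWord, lastWord)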
Example #7
import time
import traceback

from dio_core.network.downloader.downloader import Downloader
from dio_core.network.downloader.downloader import Setting
from dio_core.utils import file_util, parse_util, json_util
from dio_core_test.utils import text_util

setting = Setting()
setting.headers["Cookie"] = ("DSCKID=91aecd4c-9d62-49ad-ae1b-9eb177c787ac; JSESSIONID=5AD6666BE97FEC415491055AFAFA60FE;"
                             " seraph.rememberme.cookie=13124%3A5ad60cddb478faeca22570e7f156f07e5138011a; atlassian.xsr"
                             "f.token=BP2B-R8C4-N6CQ-HZD4_9ab65e787a0932dd3b1abcc52a792e40d154415c_lin; jira.editor.use"
                             "r.mode=wysiwyg")
setting.htmlParse = True

rows = list(file_util.readRows("/home/changshuai/PycharmProjects/dio_core/dio_core_test/Data/JIRA_LIST.txt"))

allMsg = []

for url in rows:

    try:
        res = Downloader.get(url, setting)
        repeatsText = text_util.get_first_match(res.text, r'WRM._unparsedData\["activity-panel-pipe-id"\]="(.*)";')
        # The payload is escaped twice, so decode it twice, then repair the
        # escaped forward slashes before parsing.
        repeats = repeatsText.encode("utf-8").decode("unicode-escape").encode("utf-8").decode("unicode-escape").replace("\\/", "/")
        soup = parse_util.get_bs4_soup(repeats.strip("\""))

        # Strip the user-hover widgets out of the description before reading it.
        for tag in res.soup.select_one("#description-val").select(".user-hover"):
            tag.extract()
        msgInfo = {
            "title": res.soup.select_one("#summary-val").text.strip(),
            "id": text_util.get_first_match(url, r"/(CP-\d+)").strip(),
Example #8
from ftplib import FTP

from dio_core.utils import file_util

ftp = FTP()
ftp.set_debuglevel(2)
ftp.connect("120.31.140.156", 21)
ftp.login("datastory", "datastoryFtp@2016")

ftp.cwd("/temp/2012-06-06")
files = ftp.nlst()

rows = file_util.readRows(
    "/home/changshuai/Back/PythonProject/PyWork/data/ftp_file_path.txt")

# Count how many of the expected paths are missing from the FTP listing.
missing = 0
total = 0
for row in rows:
    if row not in files:
        missing += 1
        print(row, missing, total)
    total += 1
print(missing)
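Since the FTP listing and the expected paths are both plain collections of names, the same check can be written with set arithmetic; a sketch:

# Expected paths that are absent from the FTP directory listing.
missing = set(rows) - set(files)
for path in sorted(missing):
    print(path)
print(len(missing))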