def test_ftp(directory):
    path, port = directory
    ftp = FTP(path,
              host="127.0.0.1",
              listen_host="127.0.0.1",
              listen_port=port)

    assert (ftp.uri == "ftp://%s:%s/" % ("127.0.0.1", port))

    # Test that the service starts (calling start() again should raise)
    ftp.start()

    with pytest.raises(OPVDMException):
        ftp.start()

    # Check service is running
    assert (ftp.is_running())

    # Connect to server

    requests_ftp.monkeypatch_session()
    s = requests.Session()

    r = s.retr("%s%s" % (ftp.uri, "toto.txt"))

    assert (r.status_code == 226)
    assert (r.text == CONTENT_TEST)

    # Stop the FTP test service
    ftp.stop()

    assert (ftp.is_running() is False)
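Almost every example on this page repeats the same three steps: patch requests.Session with FTP verbs, open a session, then issue an FTP command such as retr (download) or list (directory listing). A minimal sketch of that pattern, assuming an anonymous server at a placeholder address:

import requests
import requests_ftp

requests_ftp.monkeypatch_session()  # adds retr/list/nlst/size/stor to requests.Session

s = requests.Session()
# ftp.example.com is a placeholder; a completed FTP transfer reports status 226
r = s.retr('ftp://ftp.example.com/pub/readme.txt', auth=('anonymous', 'anonymous@'))
if r.status_code == 226:
    print(r.content[:80])
s.close()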
Example #2
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return

    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            # context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
Example #3
def make_http_request(url, **kwargs):
    """
    Makes http request using requests library
    :param url: URL to query
    :return: request Object
    """
    if 'requests_session' in kwargs:
        s = kwargs['requests_session']
    else:
        s = requests

    r = []
    try:
        r = s.get(url)  #requests.get(url)
        # determine if require request.post to set cookies
        if 'set_cookies' in kwargs:
            if kwargs['set_cookies'] == 'yes':
                cookies = dict(r.cookies)
                r = s.post(url, verify=True, cookies=cookies)
    except requests.exceptions.InvalidSchema:  # if url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for %s", url)
        return None
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
Example #4
def fetch_url(url: str) -> bytes:
    logger = logging.getLogger(__name__)

    if url.startswith("http"):
        logger.debug("fetching url (http) \"%s\"", url)
        response = requests.get(url)
    elif url.startswith("ftp"):
        logger.debug("fetching url (ftp) \"%s\"", url)
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        response = s.get(url)
        s.close()
    else:
        logger.error("invalid url \"%s\"", url)
        raise requests.HTTPError("invalid url \"%s\"" % url)

    if response.ok:
        # fix for url's that return 200 instead of a 404
        if int(response.headers.get("Content-Length", 0)) < 1000:
            logger.error("Content-Length is very small "
                         "(url is most likely not a valid file)")
            raise requests.HTTPError("Content-Length is very small "
                                     "(url is most likely not a valid file)")

        return response.content
    else:
        logger.error("failed to fetch url \"%s\" (%i)", url,
                     response.status_code)
        raise requests.HTTPError("failed to fetch url \"%s\" (%i)"
                                 % (url, response.status_code))
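fetch_url above dispatches on url.startswith, so any scheme beginning with "http" or "ftp" (including "ftps") falls into those branches. A sketch of the same routing keyed on the parsed scheme instead; fetch_by_scheme is a hypothetical name, not part of the original module:

from urllib.parse import urlparse

import requests
import requests_ftp

def fetch_by_scheme(url: str) -> bytes:
    # route on the exact scheme rather than a string prefix
    scheme = urlparse(url).scheme
    if scheme in ('http', 'https'):
        response = requests.get(url)
    elif scheme == 'ftp':
        requests_ftp.monkeypatch_session()
        with requests.Session() as s:
            response = s.get(url)
    else:
        raise ValueError('unsupported scheme: %r' % scheme)
    response.raise_for_status()
    return response.content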
Example #5
def download_data(file_name):
    requests_ftp.monkeypatch_session()
    resp = requests.Session().list('ftp://ftp.dd-wrt.com/betas/2020/')
    with open(file_name, 'w') as listing_file:
        listing_file.write(resp.content.decode('utf-8'))
    if file_name == new_path:
        compare_files()
Example #6
def scars(r0):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        # really f*****g slow?
        r1 = s.get(r0)
        s.close()
        return r1.content
    except Exception:
        return None
Example #7
def ftp():
    proxies = {'https': random.choice(p_list)}
    print(proxies)
    requests_ftp.monkeypatch_session()
    with requests.Session() as s:
        s.proxies.update(proxies)
        # r = s.get(r'http://jsonip.com', headers=headers)
        # ip= r.json()['ip']
        # print('Your IP is', ip)
        resp = s.list('ftp://90.130.70.73/', auth=('anonymous', 'anonymous'))
        print(resp)
Example #8
    def download_a_file(self, file_name, file_address):
        """Download a specific file"""
        requests_ftp.monkeypatch_session()
        session_requests = requests.Session()
        try:
            res = session_requests.get(file_address, stream=True)
            with open(file_name, 'wb') as output:
                shutil.copyfileobj(res.raw, output)
            session_requests.close()
        except Exception:
            logging.exception('Failed to download {}.'.format(file_name))
            return False
        return True
Example #9
    def __init__(self, url, port=21, remote=None, filename=None, local='.', user='******', password='******'):
        super(self.__class__, self).__init__(url, port, remote, filename, local, user, password)

        if self.port is None:
            self.port = 21
        if self.user is None:
            self.user = '******'
        if self.password is None:
            self.password = '******'

        #self.ftp = ftplib.FTP()
        requests_ftp.monkeypatch_session()
        self.s = requests.Session()
Example #10
    def latest(self):
        """Download latest available copy of clinvar database in vcf format."""
        import requests
        import requests_ftp
        import gzip

        requests_ftp.monkeypatch_session()
        with requests.Session() as sess:
            resp = sess.get('{}/clinvar.vcf.gz'.format(self.base_url))

        self.rawdata = gzip.decompress(resp.content)

        return self.rawdata
Example #11
def confirm_urls(request):
    if request.method == 'POST':
        confirm_result = {'R1': False, 'R2':False, 'Rf': False, 'Lr': False, 'R1_err':'', 'R2_err':'', 'Rf_err':'', 'Lr_err':''}
        for key, val in request.POST.items():
            is_status_ok = False
            is_format_ok = False
            
            if key[:3] == 'url': # e.g., key = url_R1
                # m1 = re.search('google\.com.+id=(.+)\&*', val) ## Retired
                m1 = re.search(r'google\.com/file/d/(.+)/', val)
                if m1:
                    # use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # direct download
                    if not re.match(r'^(http|https|ftp)://', val):
                        val = 'http://'+val
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(val, stream=True)
                    
                # Check the file exists
                is_status_ok = response.ok
                if not is_status_ok:
                    confirm_result[key[-2:]+'_err'] = 'File Not Found'
                else:
                    # Check file format
                    if m1:
                        m2 = re.search('filename="(.+)"', response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    
                    if key[-2:] == 'Rf':
                        if re.search(r'\.(fasta|fa|fna)+(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:]+'_err'] = 'Unknown File Format'
                    else:  # R1/R2/Lr
                        if re.search(r'\.f(ast)*q(\.gz)*$', file_name):
                            is_format_ok = True
                        else:
                            is_format_ok = False
                            confirm_result[key[-2:]+'_err'] = 'Unknown File Format'
                        
                confirm_result[key[-2:]] = is_status_ok and is_format_ok

        return HttpResponse(json.dumps(confirm_result))
Example #12
def make_http_request(url):
    r = []
    try:
        r = requests.get(url)
    except requests.exceptions.InvalidSchema:  # if url is ftp rather than http
        requests_ftp.monkeypatch_session()
        r = requests.Session().get(url)
    except requests.exceptions.ConnectionError:
        log.error("URL Connection Error for " + url)
        return None
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error('Error in URL request!')
    return r
Example #13
    def _crawl_urls_ftp(self, url, provider):
        """
        Check if a file is present on an FTP server and return the appropriate
        status code.
        """
        # We need to be able to mock this for testing and requests-mock doesn't
        # work with requests-ftp, so this is our workaround. We'll just bypass
        # this method like so (the real method returns either an int or None):
        test_ftp_status = self.source_config.get('test_ftp_status')
        if test_ftp_status == 'ok':
            return 10000
        elif test_ftp_status == 'error':
            return None

        # And now here's the real method:
        timeout = self.source_config['timeout']
        username = self.source_config['username']
        password = self.source_config['password']

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            requests_ftp.monkeypatch_session()
            s = requests.Session()
            r = s.size(url,
                       auth=HTTPBasicAuth(username, password),
                       timeout=timeout)
            status_code = r.status_code
            elapsed = r.elapsed.total_seconds()
        except (ConnectTimeout, ReadTimeout) as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999

        if status_code == 213:
            size = int(r.text)
        else:
            size = None

        if status_code not in {213, 408}:
            self._save_gather_error('{} error: {}'.format(status_code, r.text),
                                    self.job)

        self.provider_logger.info(
            log_message.format(provider, timestamp, status_code, elapsed))
        return size
Example #14
    def get_cds_url(self):
        """cds"""
        TOPLEVEL = re.compile(r'{0}.*?\.cds\.all\.fa\.gz'.format(self.species.capitalize()))
        url_1st = "ftp://ftp.ensemblgenomes.org/"
        url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/"
        url_3rd_page = url_2nd + self.species.lower() + "/cds/"
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        url_page = s.list(
            "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/fasta/{0}/cds".format(self.species.lower()))
        url_page.encoding = 'utf-8'
        download_url = re.findall(TOPLEVEL, url_page.text)
        download_url = "".join(download_url)
        url = url_3rd_page + download_url
        print(url)
        return url
Example #15
def download_cat(data_path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    try:
        r = requests.get(ebi_download + 'studies_alternative')
        if r.status_code == 200:
            catstud_name = r.headers['Content-Disposition'].split('=')[1]
            with open(
                    os.path.join(data_path, 'catalog', 'raw', 'Cat_Stud.tsv'),
                    'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catstud_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Stud file...')
        r = requests.get(ebi_download + 'ancestry')
        if r.status_code == 200:
            catanc_name = r.headers['Content-Disposition'].split('=')[1]
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Anc.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catanc_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_Anc file...')
        r = requests.get(ebi_download + 'full')
        if r.status_code == 200:
            catfull_name = r.headers['Content-Disposition'].split('=')[1]
            with open(
                    os.path.join(data_path, 'catalog', 'raw', 'Cat_Full.tsv'),
                    'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded ' + catfull_name)
        else:
            diversity_logger.debug('Problem downloading the Cat_full file...')
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        ftpsite = 'ftp://ftp.ebi.ac.uk/'
        subdom = '/pub/databases/gwas/releases/latest/'
        file = 'gwas-efo-trait-mappings.tsv'
        r = s.get(ftpsite + subdom + file)
        if r.status_code == 200:
            with open(os.path.join(data_path, 'catalog', 'raw', 'Cat_Map.tsv'),
                      'wb') as tsvfile:
                tsvfile.write(r.content)
            diversity_logger.info('Successfully downloaded efo-trait-mapping!')
        else:
            diversity_logger.debug('Problem downloading efo-trait-mappings...')
    except Exception as e:
        diversity_logger.debug('Problem downloading Catalog data!' + str(e))
Example #16
    def get_gff3_url(self):
        """gff"""
        TOPLEVEL = re.compile(r'{0}.*?\.gff3\.gz'.format(self.species.capitalize()))
        url_1st = "ftp://ftp.ensemblgenomes.org/"
        url_2nd = "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/"
        url_3rd_page = url_2nd + self.species.lower()
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        url_page = s.list(
            "ftp://ftp.ensemblgenomes.org/pub/plants/release-45/gff3/{0}".format(self.species.lower()))
        url_page.encoding = 'utf-8'
        download_url = re.findall(TOPLEVEL, url_page.text)[-1]
        print(download_url)
        download_url = "".join(download_url)
        url = url_3rd_page + "/" + download_url
        print(url)
        return url
Example #17
def createForecastCSV(folderEntry):
    if folderEntry.get() != '' and os.path.exists(os.path.dirname(folderEntry.get())):
        dateTimeObj = datetime.now()
        dateTimeObj = dateTimeObj - timedelta(hours=int(dateTimeObj.strftime("%H")))
        dayString = dateTimeObj.strftime("%d")
        monthString = dateTimeObj.strftime("%m")
        yearString = dateTimeObj.strftime("%Y")
        url = 'http://ftp1.cptec.inpe.br/modelos/tempo/WRF/ams_05km/recortes/grh/json/' + yearString + '/' + monthString + '/' + dayString + '/00/225.json'
        requests_ftp.monkeypatch_session()
        response = requests.get(url)
        print(response)
        data = response.text
        print(data)
        weather = json.loads(data)

        hora = int(dateTimeObj.strftime("%H"))
        print(str(hora))
        print(str(dateTimeObj))
        timestampStr = dateTimeObj.strftime("%d%b%Y %H")
        
        print('Current Timestamp : ', timestampStr)

        fileOutput = folderEntry.get()+'/forecast.csv'
        with open(fileOutput, 'w', newline='') as outputFile:  # output CSV file
            output = csv.writer(outputFile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            datasets = weather["datasets"][0]
            data = datasets["data"] #load json content
            outputFile.write("B,,PROSA\n")
            outputFile.write("C,UTC,PRECIP-INC\n")
            outputFile.write("E,,1HOUR\n")
            outputFile.write("F,,OBS\n")
            outputFile.write("Units,,MM\n")
            outputFile.write("Type,,PER-CUM\n")

            for i,row in enumerate(data):
                print(str(hora + i))
                outputFile.write(str(i+1) + "," + timestampStr + "00" +', '+ str(row["prec"]))
                outputFile.write('\n')
                dateTimeObj = dateTimeObj + timedelta(hours=1)
                timestampStr = dateTimeObj.strftime("%d%b%Y %H")

    elif folderEntry.get() == '':
        messagebox.showinfo('Error', 'Please Select the Destination Folder!')
    elif not os.path.exists(os.path.dirname(folderEntry.get())):
        messagebox.showinfo('Error', 'Destination Folder Doesn\'t Exist!')
Example #18
def download_cat(path, ebi_download):
    """ download the data from the ebi main site and ftp"""
    r = requests.get(ebi_download + 'studies_alternative')
    with open(os.path.join(path, 'Cat_Stud.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'ancestry')
    with open(os.path.join(path, 'Cat_Anc.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    r = requests.get(ebi_download + 'full')
    with open(os.path.join(path, 'Cat_Full.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
    requests_ftp.monkeypatch_session()
    s = requests.Session()
    ftpsite = 'ftp://ftp.ebi.ac.uk/'
    subdom = '/pub/databases/gwas/releases/latest/'
    file = 'gwas-efo-trait-mappings.tsv'
    r = s.get(ftpsite + subdom + file)
    with open(os.path.join(path, 'Cat_Map.tsv'), 'wb') as tsvfile:
        tsvfile.write(r.content)
Example #19
def getftphtmlcontent(url):
    try:
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        with s.list(url) as r:
            print(datetime.datetime.now(), 'response code:', r.status_code)
            if r.status_code == 226:
                content = r.content.decode(encoding='utf-8')
                print(datetime.datetime.now(), "content ", content)
                with open('requeste_data.txt', 'a', encoding='utf-8') as f:
                    data = json.dumps(dict(url=url, content=content)).strip()
                    f.write(data + ',')
                if content is None:
                    content = ''
                return content
            else:
                return None
    except Exception as e:
        logging.exception(e)
        print('error:', e)
        return None
Example #20
def download():
	#url = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000006665.1_ASM666v1/"
	frame = pd.read_csv("type_vi_organisms.csv")
	requests_ftp.monkeypatch_session()
	s = requests.Session()

	for index, row in frame.iterrows():
		if index < 150:
			continue
		response = s.list(row.ncbi)
		for line in response.text.split("\r\n"):
			if line:
				for entry in line.split(" "):
					if entry.endswith(".gbff.gz"):
						result = s.get(row.ncbi+"/"+entry)
						#with gzip.open('/home/joe/file.txt.gz', 'wb') as f_out:
						#    shutil.copyfileobj(f_in, f_out)
						with open("C:\\Users\\ag3r\\Downloads\\genomes\\"+entry, "wb") as output:
							output.write(result.content)
						print("%s organism %s downloaded" %(index,row.organism))
						sleep(5)
Example #21
def url_to_local_path(url, path, rename=None):
    """
    Copies a file from an http url to a local destination provided in path.
    Performs file-to-folder conversion
    :param url:
    :param path:
    :return:
    """
    if isdir(path) and '.zip' not in url and '.tar' not in url:
        new_path = join(path, url.split('/')[-1])

        if rename is not None:
            new_path = join(path, rename)

    if not url[:3] == 'ftp':

        r = requests.get(url, stream=True)

    else:
        # print 'debug firing'
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.get(url)
        # print r.status_code
        # print r.content

    if r.status_code in (200, 226):
        if not url[:3] == 'ftp':
            with open(new_path, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        else:
            with open(new_path, 'wb') as f:
                f.write(r.content)

    else:
        print(r.status_code)
        raise Exception(
            "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" %
            url)
Example #22
    def read_file(url):
        if not os.path.exists('./data'):
            os.makedirs('./data')

        file_name = url.split('/')[-1]
        file_path = './data/{}'.format(file_name)

        if not read_local or not os.path.isfile(file_path):
            requests_ftp.monkeypatch_session()
            s = requests.Session()

            if url.startswith('ftp://'):
                reply = s.retr(url, stream=True)
            else:
                reply = s.get(url, stream=True)

            with open(file_path, 'wb') as f:
                for chunk in reply.iter_content(chunk_size=2048):
                    if chunk:
                        f.write(chunk)
                        f.flush()

        if file_name.endswith('.gz'):
            f = gzip.open(file_path, 'rt')
        else:
            f = open(file_path, 'rt')

        cnt = 0
        while True:
            line = f.readline()
            if not line:
                break

            cnt += 1
            if cnt % 100000 == 0:
                print('count: ', cnt)

            yield line
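The readline loop that ends read_file can be collapsed by iterating the handle directly, which also guarantees the file is closed. A sketch of just that tail under the same file_path conventions; iter_data_lines is a hypothetical helper name:

import gzip

def iter_data_lines(file_path):
    # pick the opener by extension, then iterate the handle line by line
    opener = gzip.open if file_path.endswith('.gz') else open
    with opener(file_path, 'rt') as f:
        for cnt, line in enumerate(f, start=1):
            if cnt % 100000 == 0:
                print('count: ', cnt)
            yield line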
Example #23
def ftp_fetch(context, data):
    url = data.get("url")
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get("username", "Anonymous")
    password = context.get("password", "anonymous@ftp")

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get("http_rate_limit", settings.HTTP_RATE_LIMIT)
    limit = limit / 60  # per minute to per second for stricter enforcement
    rate_limit = get_rate_limit(resource, limit=limit, interval=1, unit=1)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule="pass", data=cached)
        return

    context.enforce_rate_limit(rate_limit)
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update(
            {
                "status_code": resp.status_code,
                "retrieved_at": datetime.utcnow().isoformat(),
                "content_hash": context.store_data(data=resp.content),
            }
        )
        context.set_tag(url, data)
        context.emit(rule="pass", data=data)
    else:
        context.enforce_rate_limit(rate_limit)
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data["url"] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule="child", data=child_data)
Example #25
def url_to_local_p_gz(url, path):
    """
    Copies a file from an http or ftp url to a local destination provided in path
    :param url:
    :param path:
    :return:
    """
    if url[:3] == 'ftp':
        requests_ftp.monkeypatch_session()
        s = requests.Session()
        r = s.retr(url)
    else:
        r = requests.get(url, stream=True)
    if r.status_code in (200, 226):
        r.raw.decode_content = True
        f_out = open(path, 'wb')
        f_in = gzip.GzipFile(fileobj=io.BytesIO(r.content))  # gzip needs a bytes buffer
        f_out.writelines(f_in)
        f_out.close()
        f_in.close()
    else:
        raise Exception(
            "Something is wrong with the url provided: %s.\n Please attempt downloading files manually" %
            url)
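url_to_local_p_gz holds the entire download in memory before gunzipping it. A streaming variant is sketched below, assuming the same ftp-vs-http split and that the patched session exposes the body on r.raw the way the HTTP branch does; url_to_local_gz_stream is a hypothetical name:

import gzip
import shutil

import requests
import requests_ftp

def url_to_local_gz_stream(url, path):
    # decompress while downloading so memory use stays flat
    if url[:3] == 'ftp':
        requests_ftp.monkeypatch_session()
        r = requests.Session().retr(url, stream=True)
    else:
        r = requests.get(url, stream=True)
    r.raise_for_status()
    with gzip.GzipFile(fileobj=r.raw) as f_in, open(path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)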
Example #27
def ftp_fetch(context, data):
    url = data.get('url')
    context.log.info("FTP fetch: %s", url)
    requests_ftp.monkeypatch_session()
    session = requests.Session()
    username = context.get('username', 'Anonymous')
    password = context.get('password', 'anonymous@ftp')

    resource = urlparse(url).netloc or url
    # a bit weird to have an HTTP rate limit while using FTP
    limit = context.get('http_rate_limit', settings.HTTP_RATE_LIMIT)
    rate_limit = get_rate_limit(resource, limit=limit)

    cached = context.get_tag(url)
    if cached is not None:
        context.emit(rule='pass', data=cached)
        return

    rate_limit.comply()
    resp = session.retr(url, auth=(username, password))
    if resp.status_code < 399:
        data.update({
            'status_code': resp.status_code,
            'retrieved_at': datetime.utcnow().isoformat(),
            'content_hash': context.store_data(data=resp.content)
        })
        context.set_tag(url, data)
        context.emit(rule='pass', data=data)
    else:
        rate_limit.comply()
        resp = session.nlst(url, auth=(username, password))
        for child in resp.iter_lines(decode_unicode=True):
            child_data = data.copy()
            child_data['url'] = os.path.join(url, child)
            context.log.info("FTP directory child: %(url)s", child_data)
            context.emit(rule='child', data=child_data)
Example #28
import itertools
import re

import os
import errno

from os import mkdir
from hashlib import md5
from os.path import join, basename, exists, abspath, dirname, splitext
from urllib.parse import urlparse
from subprocess import check_output
from tempfile import mkstemp
from hashlib import sha1
from shutil import move
from time import time

import requests
import requests_ftp
requests_ftp.monkeypatch_session()

# HTTP timeout in seconds, used in various calls to requests.get() and requests.post()
_http_timeout = 180

from .compat import csvopen, csvDictWriter
from .conform import X_FIELDNAME, Y_FIELDNAME, GEOM_FIELDNAME

def mkdirsp(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
Example #29
def check_url(url, auth, org_check):
    """Check whether the given URL is dead or alive.

    Returns a dict with four keys:

        "url": The URL that was checked (string)
        "alive": Whether the URL was working, True or False
        "status": The HTTP status code of the response from the URL,
            e.g. 200, 401, 500 (int)
        "reason": The reason for the success or failure of the check,
            e.g. "OK", "Unauthorized", "Internal Server Error" (string)

    The "status" may be None if we did not get a valid HTTP response,
    e.g. in the event of a timeout, DNS failure or invalid HTTP response.

    The "reason" will always be a string, but may be a requests library
    exception string rather than an HTTP reason string if we did not get a valid
    HTTP response.

    """
    data_provider_credentials = auth  #Auth for CMEMS

    result = {"url": url}
    try:
        if "ftp://" in url:  #Connection for FTP protocol
            print("Ftp Connection")
            requests_ftp.monkeypatch_session()  #Adds helpers for FTPConnection
            s = requests.Session()  #Raises request session with FTPAdapter
            response = s.get(url,
                             auth=(data_provider_credentials[0],
                                   data_provider_credentials[1]))
        else:  #Http/Https request
            s = requests.Session()
            if data_provider_credentials[0]:  #Connection that needs auth
                print("Connection with credentials")
                s.auth = (data_provider_credentials[0],
                          data_provider_credentials[1])
                response = s.get(url)
                time.sleep(3)
            else:  # Connection that doesn't need auth
                print("Connection without credentials")
                response = s.get(url)
        result["status"] = response.status_code
        result["reason"] = response.reason
        response.raise_for_status()  # Raise if status_code is not OK.
        result["alive"] = True
    except AttributeError as err:
        if str(err) == "'NoneType' object has no attribute 'encode'":
            # requests seems to throw these for some invalid URLs.
            result["alive"] = False
            result["reason"] = "Invalid URL"
            result["status"] = None
        else:
            raise
    except requests.exceptions.RequestException as err:
        result["alive"] = False
        if "reason" not in result:
            result["reason"] = str(err)
        if "status" not in result:
            # This can happen if the response is invalid HTTP, if we get a DNS
            # failure, or a timeout, etc.
            result["status"] = None

    # We should always have these four fields in the result.
    assert "url" in result
    assert result.get("alive") in (True, False)
    assert "status" in result
    assert "reason" in result
    return result
Example #30
    def __init__(self, host):
        requests_ftp.monkeypatch_session()
        self.s = requests.Session()
        self.host = host
Example #31
import requests
import requests_ftp
from myftp import StcakQueue
requests_ftp.monkeypatch_session()  # patch: adds FTP verbs to requests.Session
url = 'ftp://*.*.*.*'  # FTP base path
url_temp = url  # keep the un-encoded path in a temp variable
url = url.encode('utf-8').decode('latin1')  # encode then decode so Chinese characters in the path resolve
s = requests.Session()  # create the session
res = s.list(url, auth=('', ''))  # open the FTP connection
res.encoding = 'utf-8'
url = url_temp  # restore the un-encoded path so later lookups still work
print(res.text)  # one string with file type, name, date, etc.
str = res.text.split('\r\n')  # split the listing on line breaks
queue = StcakQueue()  # queue holding each file entry
for i in range(0, len(str) - 1):  # pull out every directory under the root
    a = str[i].split()  # split each line on whitespace
    b = a  # list of tokens
    if b[0] == 'drwxr-xr-x':  # directories get enqueued
        if len(b) > 9:  # file names may contain spaces, so rejoin everything from the 9th token on
            name = b[8]
            for i in range(9, len(b)):
                name += ' ' + b[i]
            queue.enqueue(name)  # enqueue the file name
        else:
            queue.enqueue(b[8])
urls = StcakQueue()  # queue of FTP paths
urls.enqueue(url)  # enqueue the starting path
temp = StcakQueue()  # used to swap queues
num = StcakQueue()  # counter
while queue.is_empty():  # when the queue is empty, every name at this level has been visited
    url_1 = urls.top()
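The token joining above exists because Unix-style LIST lines put the file name last and it may contain spaces. str.split with a maxsplit argument expresses the same parse more compactly; parse_list_line is a hypothetical helper, shown against a sample line:

def parse_list_line(line):
    # fields 0-7 are permissions/links/owner/group/size/date; the rest is the name
    fields = line.split(None, 8)
    if len(fields) < 9:
        return None, None
    return fields[0], fields[8]

sample = 'drwxr-xr-x 2 ftp ftp 4096 Jan 01 00:00 my folder name'
perms, name = parse_list_line(sample)
assert perms.startswith('d') and name == 'my folder name'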
Example #32
def pytest_configure(config):
    requests_ftp.monkeypatch_session()
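pytest_configure runs once before test collection, so patching there gives every test an FTP-capable Session without repeating the call. A minimal sketch of a test that relies on it; the server URL is a placeholder:

import requests

def test_ftp_listing():
    # Session already carries the FTP verbs thanks to pytest_configure
    s = requests.Session()
    r = s.list('ftp://ftp.example.com/pub/', auth=('anonymous', 'anonymous@'))
    assert r.status_code == 226  # FTP "transfer complete"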
Example #33
def run_data_preparation(source_path, destination_path, dict_urls,
                         kraken_usage):
    try:
        if not path.exists(destination_path):
            makedirs(destination_path)
        if kraken_usage:
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('kraken'))
        else:
            dest_raw_path = str(
                Path(destination_path).resolve().joinpath('raw'))

        if not path.exists(dest_raw_path):
            makedirs(dest_raw_path)
        if not path.exists(source_path):
            makedirs(source_path)

        if len(dict_urls):  # data comes from URLs
            for new_file_name, url in dict_urls.items():
                # m1 = re.search('google\.com.+id=(.+)\&*', url) ## Retired
                m1 = re.search(r'google\.com/file/d/(.+)/', url)
                if m1:
                    # Use GoogleDriveDownloader module
                    id = m1.group(1)
                    response = GoogleDriveDownloader.get_response(id)
                else:
                    # Direct download
                    if not re.match(r'^(http|https|ftp)://', url):
                        url = 'http://' + url
                    requests_ftp.monkeypatch_session()
                    session = requests.Session()
                    response = session.get(url, stream=True)

                # Check the file exists
                if not response.ok:
                    return -1
                else:
                    # Get the file name and extension
                    if m1:
                        m2 = re.search('filename="(.+)"',
                                       response.headers['Content-Disposition'])
                        file_name = m2.group(1)
                    else:
                        file_name = response.url.split('/')[-1]
                    extension = path.splitext(file_name)[1]

                    # Check file format
                    if new_file_name == 'reference.fa':
                        if not re.search(r'\.(fasta|fa|fna)+(\.gz)*$',
                                         file_name):
                            return -1
                    else:
                        if not re.search(r'\.f(ast)*q(\.gz)*$', file_name):
                            return -1

                    # Save a downloaded file to disk
                    if extension == '.gz':
                        with open(
                                str(
                                    Path(source_path).resolve().joinpath(
                                        new_file_name)), "wb") as destination:
                            with gzip.GzipFile(fileobj=response.raw) as source:
                                shutil.copyfileobj(source, destination)
                    else:
                        with open(
                                str(
                                    Path(source_path).resolve().joinpath(
                                        new_file_name)), "wb") as destination:
                            for chunk in response.iter_content(32768):
                                if chunk:  # filter out keep-alive new chunks
                                    destination.write(chunk)

        #save files to UID/raw
        #shutil.copy2(str(Path(source_path).resolve()), dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R1.fastq')),
                     dest_raw_path)
        shutil.copy2(str(Path(source_path).resolve().joinpath('R2.fastq')),
                     dest_raw_path)
        shutil.rmtree(source_path)
        return 0

    except Exception as e:
        print(e)
        return -1
Example #34
import warnings
import numpy as np

import requests

import pandas.compat as compat
from pandas import Panel, DataFrame
from pandas import read_csv
from pandas.io.common import urlencode
from pandas.compat import StringIO, bytes_to_str

from pandas_datareader._utils import (RemoteDataError, SymbolWarning,
                                      _sanitize_dates, _init_session)

import requests_ftp
requests_ftp.monkeypatch_session()


class _BaseReader(object):

    """

    Parameters
    ----------
        sym : string with a single stock symbol (ticker).
        start : string, (defaults to '1/1/2010')
                Starting date, timestamp. Parses many different kind of date
                representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
        end : string, (defaults to today)
                Ending date, timestamp. Same format as starting date.
        retry_count : int, default 3
Example #35
class FTP_REQUESTS(object):
    '''
        Drive FTP files through requests.
        Under the hood it is still plain FTP over sockets.
        Usable as a reference for web projects (not that it is really necessary).
    '''
    import requests, requests_ftp
    requests_ftp.monkeypatch_session()
    s = requests.Session()

    def __init__(self, ftp_url: str = 'ftp://ftp.ptree.jaxa.jp', username: str = '15174506817_163.com',
                 password: str = 'SP+wari8', s: requests_ftp.ftp.FTPSession = s):
        self.s = s
        self.ftp_url = ftp_url
        self.username = username
        self.password = password

    # build a URL from a timestamp
    def get_timeToUrl(self, ftp_url: str = None, tim=None):
        '''
        :param ftp_url: remote FTP path
        :param tim: timestamp
        :return: url
        Default return path: ftp://ftp.ptree.jaxa.jp/jma/hsd/
        '''
        strp = ''
        strf = ''
        if ftp_url is None:
            ftp_url = self.ftp_url + '/jma/hsd/'
        if tim is not None:
            if bool(re.search(r'^[\d]{9,10}$', str(tim))):
                strp = "%Y%m%d%H"
                strf = '%Y%m/%d/%H/'
            elif bool(re.search(r'^[\d]{7,8}$', str(tim))):
                strp = "%Y%m%d"
                strf = '%Y%m/%d/'
            elif bool(re.search(r'\d{6}', str(tim))):
                strp = "%Y%m"
                strf = '%Y%m/'
            if strp != '' and strf != '':
                ftp_url = time.strftime(ftp_url + strf, time.strptime(str(tim), strp))
        return ftp_url

    # list an FTP directory, optionally filtered, and return a list
    def get_ftp_urls(self, remotepath, conditions=None):
        '''
        :param remotepath: remote FTP path
        :param conditions: filter pattern
        :return: list result
        '''
        socket.setdefaulttimeout(6)
        try:
            if remotepath is None:
                remotepath = self.ftp_url
            resp = self.s.list(remotepath, auth=(self.username, self.password))
            datas_urls = []
            if resp.status_code == 226:
                print('226  Transfer complete')
                if conditions is not None:
                    filter_name = '.*' + conditions + '.*'
                    for i in resp.text.split('\n'):
                        s = re.finditer(filter_name, i)
                        for i in s:
                            datas_urls.append(i.group())
                else:
                    for i in resp.text.split('\n'):
                        datas_urls.append(i)
            elif 400 <= resp.status_code < 500:
                if resp.status_code == 404:
                    print("Directory or file does not exist!")
                raise Exception('%s Client Error for url: %s' % (resp.status_code, remotepath))
            return datas_urls
        except (socket.error, socket.gaierror):
            print("\033[0;32;40mERROR: connection timed out: [{}:{}]\033[0m".format('get_ftp_urls', remotepath))
        return None

    def download_file(self, ftp_file_path: str or FTPFileApi, dst_file_path):
        """
        Download a file from FTP to a local path
        :param ftp_file_path: FTP file to download
        :param dst_file_path: local destination
        :return:
        """
        if isinstance(ftp_file_path, FTPFileApi):
            remote_file = ftp_file_path.remotepath
            # total file size, already an int
            remote_file_size = ftp_file_path.size
        else:
            remote_file = ftp_file_path
            # the SIZE verb returns a response; read the size from its headers
            resp = self.s.size(remote_file,
                               auth=(self.username, self.password))
            if 400 <= resp.status_code < 500:
                if resp.status_code == 404:
                    print("Directory or file does not exist!")
                return 0
            remote_file_size = int(resp.headers.get('Content-Length'))
            print('remote filesize [{}]'.format(remote_file_size))
        cmpsize = 0  # bytes downloaded so far
        lsize = 0
        # check whether the local file already exists and get its size,
        # so an interrupted download can resume
        if os.path.exists(dst_file_path):
            lsize = os.stat(dst_file_path).st_size
            if lsize >= remote_file_size:
                print('local file({}b) is bigger or equal remote file({}b)'.format(lsize, remote_file_size))
                return 1
        start = time.time()

        headers = {'Range': 'bytes={}-'.format(lsize)}

        retrs = self.s.retr(remote_file,
                            auth=(self.username, self.password), headers=headers, stream=True)

        if 400 <= retrs.status_code < 500:
            if retrs.status_code == 404:
                print("Directory or file does not exist!")
            raise Exception('%s Client Error for url: %s' % (retrs.status_code, remote_file))

        with open(dst_file_path, "ab") as data:
            data.write(retrs.content)
        end = time.time()
        print(remote_file + ' done! elapsed:', (end - start))
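download_file resumes partial downloads by sizing the local file and sending a matching Range header. A short usage sketch with the defaults baked into the class; the .DAT file name is a placeholder:

ftp = FTP_REQUESTS()
url = ftp.get_timeToUrl(tim=2020010100)   # -> ftp://ftp.ptree.jaxa.jp/jma/hsd/202001/01/00/
print(ftp.get_ftp_urls(url)[:3])          # first few LIST lines for that directory
ftp.download_file(url + 'sample_segment.DAT', './sample_segment.DAT')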