Example #1
    def save(self, directory):
        """Save files in specified directory.

        Each txt url looks something like:
        https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/0001018724-19-000043.txt

        Args:
            directory (str): Path to directory where files should be saved.

        Returns:
            None

        Raises:
            ValueError: If no text URLs are available for the given filing object.
        """
        urls = self.get_urls()
        if all(len(links) == 0 for links in urls.values()):
            raise ValueError("No filings available.")

        for cik, links in urls.items():
            for link in links:
                data = requests.get(link).text
                accession_number = link.split("/")[-1]
                path = os.path.join(directory, cik, self.filing_type.value)
                make_path(path)
                path = os.path.join(path, accession_number)
                with open(path, "w") as f:
                    f.write(data)
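
For orientation, here is a hedged sketch of how this `save` method might be called. The `Filing` constructor and `FilingType` enum are assumptions inferred from the attributes the method uses (`self.get_urls()`, `self.filing_type.value`), not a confirmed API:

    # Hypothetical usage -- class and enum names are assumptions inferred
    # from the attributes used above, not a confirmed API.
    filing = Filing(cik="0001018724", filing_type=FilingType.FILING_10K)
    filing.save("/tmp/sec_filings")
    # Expected layout: /tmp/sec_filings/<CIK>/<filing type>/<accession number>.txt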
Example #2
    def save(self, directory):
        """Save all daily filings.

        Will store all filings for each unique company name under a separate subdirectory
        within the given directory argument.

        Ex:
        my_directory
        |
        ---- Apple Inc.
             |
             ---- ...txt files
        ---- Microsoft Corp.
             |
             ---- ...txt files

        Args:
            directory (str): Directory where filings should be stored. Will be broken down
                further by company name.
        """
        self.get_filings_dict()
        for filings in self._filings_dict.values():
            # take the company name from the first filing and make that the subdirectory name
            subdirectory = os.path.join(directory, filings[0].company_name)
            make_path(subdirectory)
            for filing in filings:
                filename = filing.file_name.split('/')[-1]
                filing_path = os.path.join(subdirectory, filename)
                url = self.make_url(filename)
                data = requests.get(url).text
                with open(filing_path, 'w') as f:
                    f.write(data)
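
The `make_url` helper is not shown in this example. A hedged guess at its shape, assuming it rebuilds the EDGAR Archives URL visible in Examples #1 and #3, where the accession-number prefix doubles as the zero-padded CIK:

    # Hypothetical helper -- the real make_url is not shown in these examples.
    # The URL shape is copied from the sample URL in Examples #1 and #3.
    def make_url(self, filename):
        cik = filename.split("-")[0].lstrip("0")
        folder = filename.replace("-", "").replace(".txt", "")
        return ("https://www.sec.gov/Archives/edgar/data/"
                "{cik}/{folder}/{filename}".format(
                    cik=cik, folder=folder, filename=filename))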
Example #3
    def save(self, directory):
        """Save files in specified directory.
        Each txt url looks something like:
        https://www.sec.gov/Archives/edgar/data/1018724/000101872419000043/0001018724-19-000043.txt

        Args:
            directory (str): Path to directory where files should be saved.

        Returns:
            None

        Raises:
            ValueError: If no text URLs are available for the given filing object.
        """
        urls = self.get_urls()
        if len(urls) == 0:
            raise ValueError("No filings available.")
        doc_names = [url.split("/")[-1] for url in urls]
        for url, doc_name in zip(urls, doc_names):
            cik = doc_name.split('-')[0]
            data = requests.get(url).text
            path = os.path.join(directory, cik, self.filing_type.value)
            make_path(path)
            path = os.path.join(path, doc_name)
            with open(path, "w") as f:
                f.write(data)
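
To make the filename handling above concrete, here it is applied to the sample URL from the docstring:

    # Worked example of the parsing above, using the docstring's sample URL.
    url = ("https://www.sec.gov/Archives/edgar/data/1018724/"
           "000101872419000043/0001018724-19-000043.txt")
    doc_name = url.split("/")[-1]  # "0001018724-19-000043.txt"
    cik = doc_name.split("-")[0]   # "0001018724" (zero-padded CIK)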
Example #4
    def save_filings(self, directory):
        """Save all filings.

        Will store all filings for each unique CIK under a separate subdirectory
        within the given directory argument.

        Ex:
        my_directory
        |
        ---- CIK 1
             |
             ---- ...txt files
        ---- CIK 2
             |
             ---- ...txt files

        Args:
            directory (str): Directory where filings should be stored.
        """
        urls = self._check_urls_exist()

        for company, links in urls.items():
            for link in links:
                data = requests.get(link).text
                path = os.path.join(directory, company)
                make_path(path)
                path = os.path.join(path, self.get_accession_number(link))
                with open(path, "w") as f:
                    f.write(data)
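
`get_accession_number` is not shown here, but Examples #1 and #3 extract the same trailing path component inline; a hedged one-liner consistent with those examples:

    def get_accession_number(self, link):
        # Assumed equivalent of the inline link.split("/")[-1] in Examples #1 and #3.
        return link.split("/")[-1]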
Example #5
    def save_filings(self, directory):
        """Save all filings.

        Will store all filings for each unique company name under a separate subdirectory
        within the given directory argument.

        Ex:
        my_directory
        |
        ---- Apple Inc.
             |
             ---- ...txt files
        ---- Microsoft Corp.
             |
             ---- ...txt files

        Args:
            directory (str): Directory where filings should be stored. Will be broken down
                further by company name.
        """
        self.get_filings_dict()
        for filings in self._filings_dict.values():
            # take the company name from the first filing, clean it into a
            # valid directory name, and use that as the subdirectory name
            clean_company_name = self.clean_directory_path(
                filings[0].company_name)
            subdirectory = os.path.join(directory, clean_company_name)
            make_path(subdirectory)
            for filing in filings:
                filename = self.get_accession_number(filing.file_name)
                filing_path = os.path.join(subdirectory, filename)
                url = self.make_url(filename)
                data = requests.get(url).text
                with open(filing_path, 'w') as f:
                    f.write(data)
Example #6
    def test_make_path_expand_user(self):
        # make sure that you do not have a directory matching this if testing locally
        path_to_expand = "~/_____testing_____"
        utils.make_path(path_to_expand)
        path_expanded = os.path.expanduser(path_to_expand)
        try:
            assert os.path.exists(path_expanded)
        finally:
            os.rmdir(path_expanded)
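
The `make_path` helper never appears in these examples. Based on how it is used (called repeatedly on paths that may already exist, and expanding "~" per the test above), a minimal sketch could look like the following; this is an assumption, not the library's actual implementation:

    import os

    def make_path(path):
        """Expand "~" and create the directory tree if it does not already exist."""
        expanded = os.path.expanduser(path)
        if not os.path.exists(expanded):
            os.makedirs(expanded)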
Example #7
    def _save_filings(self,
                      directory,
                      dir_pattern="{cik}",
                      file_pattern="{accession_number}",
                      download_all=False):
        """Save all filings.

        Will store all filings under ``directory``, further separating them
        using ``dir_pattern`` and ``file_pattern``.

        Args:
            directory (str): Directory where filings should be stored.
            dir_pattern (str): Format string for subdirectories. Default is `{cik}`.
                Valid options are `{cik}`.
            file_pattern (str): Format string for files. Default is `{accession_number}`.
                Valid options are `{accession_number}`.
            download_all (bool): Download method. If True, downloads bulk tar files;
                if False, downloads each file in the index individually. Default is
                `False`.
        """
        urls = self._check_urls_exist()

        if download_all:
            # Download tar files into a temporary directory
            extract_directory = os.path.join(directory, 'temp')
            i = 0
            while os.path.exists(extract_directory):
                # Ensure that there is no name clashing
                extract_directory = os.path.join(directory,
                                                 'temp{i}'.format(i=i))
                i += 1

            make_path(extract_directory)
            self._unzip(extract_directory=extract_directory)
            self._move_to_dest(urls=urls,
                               extract_directory=extract_directory,
                               directory=directory,
                               file_pattern=file_pattern,
                               dir_pattern=dir_pattern)

            # Remove the initial extracted data
            shutil.rmtree(extract_directory)
        else:
            inputs = []
            for company, links in urls.items():
                formatted_dir = dir_pattern.format(cik=company)
                for link in links:
                    formatted_file = file_pattern.format(
                        accession_number=self.get_accession_number(link))
                    path = os.path.join(directory, formatted_dir,
                                        formatted_file)
                    inputs.append((link, path))
            loop = asyncio.get_event_loop()
            loop.run_until_complete(
                self.client.wait_for_download_async(inputs))
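
`wait_for_download_async` is likewise not shown. Assuming it fetches every (link, path) pair concurrently, a minimal sketch using aiohttp might look like this (only the input shape comes from the code above; the body is hypothetical):

    import asyncio
    import os
    import aiohttp

    async def wait_for_download_async(inputs):
        # Hypothetical sketch: fetch every (link, path) pair concurrently.
        async def download_one(link, path, session):
            async with session.get(link) as response:
                contents = await response.read()
            make_path(os.path.dirname(path))
            with open(path, "wb") as f:
                f.write(contents)

        async with aiohttp.ClientSession() as session:
            await asyncio.gather(
                *(download_one(link, path, session) for link, path in inputs))

This mirrors the `fetch_and_save` coroutine in Example #9.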
Example #8
    def _do_create_and_copy(q):
        """Create path and copy file to end of path.

        Args:
            q (Queue.queue): Queue to get filename, new directory,
                and old path information from.
        """
        while True:
            try:
                filename, new_dir, old_path = q.get(timeout=1)
            except Empty:
                return
            make_path(new_dir)
            path = os.path.join(new_dir, filename)
            shutil.copyfile(old_path, path)
            q.task_done()
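
The producer side of this worker is not shown. A hedged sketch of how it might be driven, assuming a plain `queue.Queue` of (filename, new_dir, old_path) tuples and a few worker threads:

    import threading
    from queue import Queue  # the worker's Empty exception comes from this module

    q = Queue()
    for item in work_items:  # hypothetical iterable of (filename, new_dir, old_path)
        q.put(item)

    for _ in range(4):
        threading.Thread(target=_do_create_and_copy, args=(q,)).start()
    q.join()  # task_done() in the worker lets join() return once all items finish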
Example #9
    async def fetch_and_save(link, path, session):
        """Fetch link and save to path using session."""
        contents = await self.fetch(link, session)
        make_path(os.path.dirname(path))
        with open(path, "wb") as f:
            f.write(contents)
Example #10
    def process(self,
                infile,
                out_dir=None,
                create_subdir=True,
                rm_infile=False):
        """Process a text file and save processed files.

        Args:
            infile (str): Full path to a text file.
            out_dir (str): Directory to store output files. Defaults to the parent directory of
                infile.
            create_subdir (bool): If a subdirectory with the name of the infile should be created.
                If this is not true, files will be prefixed with the infile filename.
            rm_infile (bool): If the infile should be removed after processing. Defaults to False.

        Returns:
            None
        """
        if not infile.endswith('.txt'):
            raise ValueError(
                '{file} does not appear to be a .txt file.'.format(
                    file=infile))

        with open(infile, encoding="utf8") as f:
            intxt = f.read()

        if out_dir is None:
            out_dir = os.path.dirname(infile)
        infile_base = os.path.basename(infile).split('.txt')[0]
        metadata_file_format = "{base}_{num}.metadata.json"
        document_file_format = '{base}_{sec_doc_num}.{file}'
        if create_subdir:
            out_dir = os.path.join(out_dir, infile_base)
            make_path(out_dir)
            metadata_file_format = "{num}.metadata.json"
            document_file_format = '{sec_doc_num}.{file}'
        sec_doc_cursor = 0
        sec_doc_count = intxt.count("<SEC-DOCUMENT>")
        for sec_doc_num in range(sec_doc_count):
            sec_doc_match = self.re_sec_doc.search(intxt, pos=sec_doc_cursor)
            if not sec_doc_match:
                break

            sec_doc_cursor = sec_doc_match.span()[1]
            sec_doc = sec_doc_match.group(1)

            # metadata
            metadata_match = self.re_sec_header.search(sec_doc)
            metadata_txt = metadata_match.group(1)
            metadata_cursor = metadata_match.span()[1]
            metadata_filename = metadata_file_format.format(base=infile_base,
                                                            num=sec_doc_num)
            metadata_file = os.path.join(out_dir, metadata_filename)
            metadata_dict = self.process_metadata(metadata_txt)
            # logging.info("Metadata written into {}".format(metadata_file))

            # Loop through every document
            metadata_dict["documents"] = []
            documents = sec_doc[metadata_cursor:].strip()
            doc_count = documents.count("<DOCUMENT>")
            doc_cursor = 0
            for doc_num in range(doc_count):
                doc_match = self.re_doc.search(documents, pos=doc_cursor)
                if not doc_match:
                    break
                doc = doc_match.group(1)
                doc_cursor = doc_match.span()[1]
                doc_metadata = self.process_document_metadata(doc)
                metadata_dict["documents"].append(doc_metadata)

                # Get file data and file name
                doc_filename = doc_metadata["filename"]
                doc_txt = self.re_text.search(doc).group(1).strip()
                target_doc_filename = document_file_format.format(
                    base=infile_base,
                    sec_doc_num=sec_doc_num,
                    file=doc_filename)
                doc_outfile = os.path.join(out_dir, target_doc_filename)

                is_uuencoded = doc_txt.find("begin 644 ") != -1

                if is_uuencoded:
                    logging.info(
                        "{} contains an uu-encoded file".format(infile))
                    encfn = doc_outfile + ".uu"
                    with open(encfn, "w", encoding="utf8") as encfh:
                        encfh.write(doc_txt)
                    uu.decode(encfn, doc_outfile)
                    os.remove(encfn)
                else:
                    logging.info(
                        "{} contains a non uu-encoded file".format(infile))
                    with open(doc_outfile, "w", encoding="utf8") as outfh:
                        outfh.write(doc_txt)

            # Save SEC-DOCUMENT metadata to file
            with open(metadata_file, "w", encoding="utf8") as fileh:
                formatted_metadata = json.dumps(metadata_dict,
                                                indent=2,
                                                sort_keys=True,
                                                ensure_ascii=False)
                fileh.write(formatted_metadata)

        if rm_infile:
            os.remove(infile)
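
The `re_sec_doc`, `re_sec_header`, `re_doc`, and `re_text` patterns are never shown. Given the tag counting and `group(1)` usage above, plausible shapes would be the following; these are assumptions about the SGML-like markup, not the module's actual patterns:

    import re

    # Hypothetical patterns consistent with the counting logic above.
    re_sec_doc = re.compile(r"<SEC-DOCUMENT>(.*?)</SEC-DOCUMENT>", re.DOTALL)
    re_sec_header = re.compile(r"(<SEC-HEADER>.*?</SEC-HEADER>)", re.DOTALL)
    re_doc = re.compile(r"<DOCUMENT>(.*?)</DOCUMENT>", re.DOTALL)
    re_text = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL)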