Example 1
def open(tarname, nojson=False):
        """Opens a tar file containing bzip2-compressed chunks of lines containing
        JSON objects.

        Use as an iterator, like this:

        for obj in tarchunk.open("blah.tar"):
            print(obj['text'])

        for s in tarchunk.open("blah.tar", nojson=True):
            # s is a string
             
        """
        global good, bad
        tar = tarfile.open(tarname, mode='r|*')

        for tarinfo in tar:

            name = tarinfo.name
            try:   
                obj = tar.extractfile(tarinfo)
                if obj is None:
                    continue

                if nojson:
                    yield from bz2.open(obj)
                else:
                    for line in bz2.open(obj):
                        yield json.loads(line.decode('utf8'))
                
                good += 1

            except Exception as e:
                print("Choked on {0}: {1}".format(name, e))
                bad += 1
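
A minimal sketch of the module context the generator assumes (the imports and the module-level good/bad counters are taken from the code above; nothing else is known about the surrounding tarchunk module):

import bz2
import json
import tarfile

# Module-level counters that open() updates via `global good, bad`.
good = 0
bad = 0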
Example 2
def bz2_open(file, mode='r'):
    """Abstract the numerous ways BZ2 files are handled in Python.

    @param file:    The file path to open.
    @type file:     str
    @keyword mode:  The mode to open the file with.  Only the values of 'r' and 'w' for reading and writing respectively are supported.
    @type mode:     str
    @return:        The bzip2 file object.
    @rtype:         file object
    """

    # Check the mode.
    if mode not in ['r', 'w']:
        raise RelaxError("The mode '%s' must be one or 'r' or 'w'." % mode)

    # Check if the bz2 module exists.
    if not bz2_module:
        if mode == 'r':
            raise RelaxError("Cannot open the file %s, try uncompressing first.  %s." % (file, bz2_module_message))
        else:
            raise RelaxError("Cannot create bzip2 file %s, the bz2 Python module cannot be found." % file)

    # Open the file for reading.
    if mode == 'r':
        # Python 3.3 text mode.
        if sys.version_info[0] == 3 and sys.version_info[1] >= 3:
            file_obj = bz2.open(file, 't')

        # Python 3.0, 3.1 and 3.2 text mode.
        elif sys.version_info[0] == 3 and sys.version_info[1] < 3:
            file_obj = io.TextIOWrapper(Bzip2Fixed(file, 'r'))

        # Python 2 text mode.
        else:
            file_obj = bz2.BZ2File(file, 'r')

    # Open the file for writing.
    elif mode == 'w':
        # Python 3.3 text mode.
        if sys.version_info[0] == 3 and sys.version_info[1] >= 3:
            file_obj = bz2.open(file, 'wt')

        # Python 3.0, 3.1 and 3.2 text mode.
        elif sys.version_info[0] == 3 and sys.version_info[1] < 3:
            file_obj = io.TextIOWrapper(Bzip2Fixed(file, 'w'))

        # Python 2 text mode.
        else:
            file_obj = bz2.BZ2File(file, 'w')

    # Return the file object.
    return file_obj
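
On Python 3.3 and later the whole version gate collapses into a single bz2.open() call in text mode. A minimal modern sketch (a plain ValueError stands in for relax's RelaxError, whose definition is not shown here):

import bz2

def bz2_open_modern(file, mode='r'):
    """Simplified equivalent of the function above for Python >= 3.3."""
    if mode not in ('r', 'w'):
        raise ValueError("The mode '%s' must be one of 'r' or 'w'." % mode)
    return bz2.open(file, mode + 't')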
Example 3
def open_file(filename, mode, encoding=None):
    import sys, io

    binary = mode.endswith("b")
    mode = mode.rstrip("b") + "b"

    if mode.startswith("r"):
        if filename == "-":
            fileobj = sys.stdin.buffer
        else:
            fileobj = open(filename, mode)

        buf = fileobj.peek(100)

        if buf.startswith(b"\x1f\x8b\x08"):
            import gzip
            fileobj = gzip.open(fileobj, mode)

        elif buf[0:3] == b"BZh" and buf[4:10] == b"1AY&SY":
            import bz2
            fileobj = bz2.open(fileobj, mode)

        elif buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            import lzma
            fileobj = lzma.open(fileobj, mode)

    else:
        if filename == "-":
            fileobj = sys.stdout.buffer

        elif filename.endswith(".gz"):
            import gzip
            fileobj = gzip.open(filename, mode)

        elif filename.endswith(".bz2"):
            import bz2
            fileobj = bz2.open(filename, mode)

        elif filename.endswith(".xz"):
            import lzma
            fileobj = lzma.open(filename, mode)

        else:
            fileobj = open(filename, mode)

    if binary:
        return fileobj
    else:
        return io.TextIOWrapper(fileobj, encoding=encoding,
                errors="surrogateescape", line_buffering=True)
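
A short usage sketch (the file name is hypothetical). Note that in read mode the compression format is detected from the magic bytes, not from the extension:

with open_file("data.txt.bz2", "r") as fh:
    for line in fh:
        print(line, end="")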
Example 4
 def _build_vocabulary(self, vocabulary_size):
     """
     Count words in the pages file and write a list of the most frequent
     words to the vocabulary file.
     """
     counter = collections.Counter()
     with bz2.open(self._pages_path, 'rt') as pages:
         for page in pages:
             words = page.strip().split()
             counter.update(words)
     # most_common() returns (word, count) pairs; take the words before
     # prepending the '<unk>' placeholder.
     common = [x[0] for x in counter.most_common(vocabulary_size - 1)]
     common = ['<unk>'] + common
     with bz2.open(self._vocabulary_path, 'wt') as vocabulary:
         for word in common:
             vocabulary.write(word + '\n')
Example 5
 def _read_pages(self, url):
     """
     Extract plain words from a Wikipedia dump and store them to the pages
     file. Each page will be a line with words separated by spaces.
     """
     wikipedia_path = download(url, self._cache_dir)
     with bz2.open(wikipedia_path) as wikipedia, \
             bz2.open(self._pages_path, 'wt') as pages:
         for _, element in etree.iterparse(wikipedia, tag='{*}page'):
             if element.find('./{*}redirect') is not None:
                 continue
             page = element.findtext('./{*}revision/{*}text')
             words = self._tokenize(page)
             pages.write(' '.join(words) + '\n')
             element.clear()
Example 6
def test_output_files(logger, args, result):
    if not args.publish_results:
        return
    # Only when there is an output directory does it need publishing.
    dstdir = _mkdir_test_output(logger, args, result)
    if not dstdir:
        return
    # copy plain text files
    good = re.compile(r"(\.txt|\.diff|^RESULT)$")
    log = re.compile(r"(\.log)$")
    for name in os.listdir(result.output_directory):
        # copy simple files
        src = os.path.join(result.output_directory, name)
        dst = os.path.join(dstdir, name)
        if os.path.isfile(dst) and os.path.samefile(src, dst):
            continue
        if os.path.isfile(dst) \
        and os.path.getmtime(src) < os.path.getmtime(dst):
            continue
        if good.search(name):
            logger.info("copying '%s' to '%s'", src, dst)
            shutil.copyfile(src, dst)
            continue
        # copy compressed files
        dst = dst + ".bz2"
        if os.path.isfile(dst) \
        and os.path.getmtime(src) < os.path.getmtime(dst):
            continue
        if log.search(name):
            logger.info("compressing '%s' to '%s'", src, dst)
            with open(src, "rb") as f:
                data = f.read()
            with bz2.open(dst, "wb") as f:
                f.write(data)
            continue
Example 7
def asHandle(fileNameOrHandle, mode='r'):
    """
    Decorator for file opening that makes it easy to open compressed files.
    Based on L{Bio.File.as_handle}.

    @param fileNameOrHandle: Either a C{str} or a file handle.
    @return: A generator that can be turned into a context manager via
        L{contextlib.contextmanager}.
    """
    if isinstance(fileNameOrHandle, six.string_types):
        if fileNameOrHandle.endswith('.gz'):
            if six.PY3:
                yield gzip.open(fileNameOrHandle, mode='rt', encoding='UTF-8')
            else:
                yield gzip.GzipFile(fileNameOrHandle)
        elif fileNameOrHandle.endswith('.bz2'):
            if six.PY3:
                yield bz2.open(fileNameOrHandle, mode='rt', encoding='UTF-8')
            else:
                yield bz2.BZ2File(fileNameOrHandle)
        else:
            with open(fileNameOrHandle) as fp:
                yield fp
    else:
        yield fileNameOrHandle
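
As the docstring says, the generator is meant to be wrapped with contextlib.contextmanager. A minimal usage sketch (the file name is hypothetical):

from contextlib import contextmanager

with contextmanager(asHandle)('reads.fasta.bz2') as fp:
    for line in fp:
        print(line, end='')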
Example 8
def main():
    usage = 'usage: %prog [options] <trim_length> <fastq_file>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide trim length and FASTQ file')
    else:
        trim_length = int(args[0])
        fastq_file = args[1]

    if fastq_file[-3:] == '.gz':
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file[-4:] == '.bz2':
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)

    header = fastq_in.readline().rstrip()
    while header:
        seq = fastq_in.readline().rstrip()
        mid = fastq_in.readline().rstrip()
        qual = fastq_in.readline().rstrip()

        # trim
        seq = seq[:trim_length]
        qual = qual[:trim_length]                  

        print('%s\n%s\n%s\n%s' % (header,seq,mid,qual))

        header = fastq_in.readline().rstrip()

    fastq_in.close()
Example 9
def unbzip(fname):
    tmp_fname = tempfile.mkstemp()[1]
    # The decompressed stream is bytes, so the temporary file must be
    # opened in binary mode, and both handles should be closed.
    with open(tmp_fname, "wb") as tmpf, bz2.open(fname, "rb") as bz:
        tmpf.write(bz.read())
    return tmp_fname
Example 10
 def commit(self):
     if not self.to_write:
         raise DBException('Need to be in write mode to commit')
     try:
         os.makedirs(self.node_dir)
     except FileExistsError:
         pass  # This is ok
     w = bz2.open(self.node_file + '.tmp', 'wt', encoding='utf-8')
     start_pos = self.key.get_last_key()
     if self.db.is_sparse:
         poses = []
         vals = []
         for i, val in enumerate(self._vals):
             if val is not None:
                 vals.append(val)
                 poses.append(i)
         w.write('\t'.join([str(x + start_pos) for x in poses]))
         w.write('\n')
         for v in vals:
             w.write('%s\n' % repr(v))
     else:
         for v in self._vals:
             w.write('%s\n' % repr(v))
     w.close()
     os.rename(self.node_file + '.tmp', self.node_file)
Example 11
 def xml_writer(self, filename):
     if self.xowa:
         try:
             while True:
                 line = (yield)
                 #f.write(line.encode('utf-8'))
                 print(line, end='')
         except GeneratorExit:
             pass
         logging.info('XML-Stream: %s done.', filename)
     elif self.compress:
         with bz2.open(filename+'.bz2', 'w') as f:
             try:
                 while True:
                     line = (yield)
                     f.write(line.encode('utf-8'))
             except GeneratorExit:
                 pass
         logging.info('File: %s.bz2 done.', filename)
     else:
         with open(filename, 'w', encoding='utf-8') as f:
             try:
                 while True:
                     line = (yield)
                     f.write(line)
             except GeneratorExit:
                 pass
         logging.info('File: %s done.', filename)
Example 12
def zopen(filename, *args, **kwargs):
    """
    This function wraps around the bz2, gzip and standard python's open
    function to deal intelligently with bzipped, gzipped or standard text
    files.

    Args:
        filename (str/Path): filename or pathlib.Path.
        \*args: Standard args for python open(..). E.g., 'r' for read, 'w' for
            write.
        \*\*kwargs: Standard kwargs for python open(..).

    Returns:
        File-like object. Supports with context.
    """
    if Path is not None and isinstance(filename, Path):
        filename = str(filename)

    name, ext = os.path.splitext(filename)
    ext = ext.upper()
    if ext == ".BZ2":
        if PY_VERSION[0] >= 3:
            return bz2.open(filename, *args, **kwargs)
        else:
            args = list(args)
            if len(args) > 0:
                args[0] = "".join([c for c in args[0] if c != "t"])
            if "mode" in kwargs:
                kwargs["mode"] = "".join([c for c in kwargs["mode"]
                                          if c != "t"])
            return bz2.BZ2File(filename, *args, **kwargs)
    elif ext in (".GZ", ".Z"):
        return gzip.open(filename, *args, **kwargs)
    else:
        return io.open(filename, *args, **kwargs)
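
A short usage sketch: zopen() picks the opener from the extension, so the same call works for .bz2, .gz and plain files (the file name is hypothetical):

with zopen("results.json.bz2", "rt") as f:
    data = f.read()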
Example 13
 def make_available(self):
     bz2filename = "{}.bz2".format(self.filename)
     if not os.path.isfile(bz2filename):
         download_and_save_file(self.url, bz2filename)
     with bz2.open(bz2filename, 'r') as f:
         data = f.read().decode("ascii")
         save_file(data, self.filename)
Example 14
    def _open(self, filename):
        """
        Open the input file. Set self._fp to point to it. Read the first
        line of parameters.

        @param filename: A C{str} filename containing JSON BLAST records.
        @raise ValueError: if the first line of the file isn't valid JSON,
            if the input file is empty, or if the JSON does not contain an
            'application' key.
        """
        if filename.endswith('.bz2'):
            if six.PY3:
                self._fp = bz2.open(filename, mode='rt', encoding='UTF-8')
            else:
                self._fp = bz2.BZ2File(filename)
        else:
            self._fp = open(filename)

        line = self._fp.readline()
        if not line:
            raise ValueError('JSON file %r was empty.' % self._filename)

        try:
            self.params = loads(line[:-1])
        except ValueError as e:
            raise ValueError(
                'Could not convert first line of %r to JSON (%s). '
                'Line is %r.' % (self._filename, e, line[:-1]))
        else:
            if 'application' not in self.params:
                raise ValueError(
                    '%r appears to be an old JSON file with no BLAST global '
                    'parameters. Please re-run convert-blast-xml-to-json.py '
                    'to convert it to the newest format.' % self._filename)
Example 15
File: io.py Project: dwinston/monty
def zopen(filename, *args, **kwargs):
    """
    This function wraps around the bz2, gzip and standard python's open
    function to deal intelligently with bzipped, gzipped or standard text
    files.

    Args:
        filename (str): filename
        \*args: Standard args for python open(..). E.g., 'r' for read, 'w' for
            write.
        \*\*kwargs: Standard kwargs for python open(..).

    Returns:
        File-like object. Supports with context.
    """
    file_ext = filename.split(".")[-1].upper()
    if file_ext == "BZ2":
        if PY_VERSION[0] >= 3:
            return bz2.open(filename, *args, **kwargs)
        else:
            args = list(args)
            if len(args) > 0:
                args[0] = "".join([c for c in args[0] if c != "t"])
            if "mode" in kwargs:
                kwargs["mode"] = "".join([c for c in kwargs["mode"] if c !=
                                          "t"])
            return bz2.BZ2File(filename, *args, **kwargs)
    elif file_ext in ("GZ", "Z"):
        return gzip.open(filename, *args, **kwargs)
    else:
        return open(filename, *args, **kwargs)
Example 16
def get_uncompressed_stream(input_stream, compression="auto"):
    """
    Returns a file-like object (aka stream) providing an uncompressed
    version of the content read on the input stream provided.

    :param input_stream: The file-like object providing compressed data.
    :param compression: The compression type. Specify "auto" to let the function
        guess it out of the associated filename (the input_stream needs to have
        a name attribute, otherwise a ValueError is raised).
    :type compression: str
    """

    if compression == "auto":  # Try to guess compression method if possible
        if hasattr(input_stream, 'name'):
            compression = guess_compression_method(input_stream.name)
        else:
            raise ValueError("Can't retrieve a name out of %r" % input_stream)

    if compression == "gzip":
        import gzip
        return gzip.open(filename=input_stream, mode="rb")
    elif compression == "bzip2":
        import bz2
        return bz2.open(filename=input_stream, mode="rb")
    elif compression == "xz":
        import lzma
        return lzma.open(filename=input_stream, mode="rb")
    elif compression is None:
        return input_stream
    else:
        raise NotImplementedError(
            "Unknown compression method: %r" % compression)
Example 17
 def __enter__(self):
     if self.f_name is None:
         self.logger.error('File name cannot be empty.')
     elif self.pat_archive.search(self.f_name):
          self.logger.warning('File \'{f}\' does not have a supported extension.'.format(f=self.f_name))
      elif not os.access(self.f_name, os.R_OK):
          self.logger.warning('File \'{f}\' cannot be read.'.format(f=self.f_name))
     else:
         f_mode = 'rt'
         f_codec = locale.getpreferredencoding(False)  # Or 'UTF-8'
         f_err = 'surrogateescape'  # Or 'ignore'
         try:
             if self.f_name.endswith('.gz') or self.f_name.endswith('.gzip'):
                 with open(self.f_name, 'rb') as byte_handle:
                     if bytearray.fromhex('1f8b08') in byte_handle.read(3):
                         self.handle = gzip.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
             elif self.f_name.endswith('.bz') or self.f_name.endswith('.bz2'):
                 with open(self.f_name, 'rb') as byte_handle:
                     if bytearray.fromhex('425a68') in byte_handle.read(3):
                         self.handle = bz2.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
             # elif self.f_name.endswith('.lzma') or self.f_name.endswith('.lzma'):
             #    with open(self.f_name, 'rb') as byte_handle:
             #        if bytearray.fromhex('5d0000') in byte_handle.read(3):
             #            self.handle = lzma.open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
             else:
                 self.handle = open(self.f_name, mode=f_mode, encoding=f_codec, errors=f_err)
             return self
         except IOError:
             self.logger.error('Exception opening \'{f}\'.'.format(f=self.f_name))
             return self
Example 18
 def load_model(self):
     if not os.path.exists(self.get_filename(absolute=True)):
         if args.train: return {}, {}
         error("Model file with pre-trained convolution layers not found. Download it here...",
               "https://github.com/alexjc/neural-enhance/releases/download/v%s/%s"%(__version__, self.get_filename()))
     print('  - Loaded file `{}` with trained model.'.format(self.get_filename()))
     return pickle.load(bz2.open(self.get_filename(absolute=True), 'rb'))
Example 19
    def prepare_command_line(self):
        '''
        Develops the Commandline to run FastQC in Galaxy
        '''
        
        # Check whether a given file compression format is valid
        # This prevents uncompression of already uncompressed files
#        infname = self.opts.inputfilename
        infname = self.opts.input ### http://dev.list.galaxyproject.org/FastQC-wrapper-not-seeing-files-at-gzipped-td4666363.html
        linf = infname.lower()
        trimext = False
        # decompression at upload currently does NOT remove this now bogus ending - fastqc will barf
        # patched may 29 2013 until this is fixed properly
        if ( linf.endswith('.gz') or linf.endswith('.gzip') ): 
            f = gzip.open(self.opts.input)
            try:
                f.readline()
            except:
                trimext = True
            f.close()
        elif linf.endswith('bz2'):
            f = bz2.open(self.opts.input,'rb')
            try:
                f.readline()
            except:
                trimext = True
            f.close()
        elif linf.endswith('.zip'):
            if not zipfile.is_zipfile(self.opts.input):
                trimext = True
        if trimext:
            f = open(self.opts.input)
            try:
                f.readline()
            except Exception:
                raise Exception("Input file corruption, could not identify the filetype")
            infname = os.path.splitext(infname)[0]

        # Replace unwanted or problematic characters in the input file name
        self.fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        # check that the symbolic link gets a proper ending, fastqc seems to ignore the given format otherwise
        if 'fastq' in self.opts.informat:
            # with fastq the .ext is ignored, but when a format is actually passed it must comply with fastqc's
            # accepted formats..
            self.opts.informat = 'fastq'
        elif not self.fastqinfilename.endswith(self.opts.informat):
            self.fastqinfilename += '.%s' % self.opts.informat

        # Build the Commandline from the given parameters
        command_line = [self.opts.executable, '--outdir %s' % self.opts.outputdir]
        if self.opts.contaminants is not None:
            command_line.append('--contaminants %s' % self.opts.contaminants)
        if self.opts.limits is not None:
            command_line.append('--limits %s' % self.opts.limits)
        command_line.append('--quiet')
        command_line.append('--extract')  # to access the output text file
        command_line.append(self.fastqinfilename)
        command_line.append('-f %s' % self.opts.informat)
        command_line.append('-t ${GALAXY_SLOTS:-4}')
        self.command_line = ' '.join(command_line)
Example 20
 def __iter__(self):
     """Iterate over pages represented as lists of word indices."""
     with bz2.open(self._pages_path, 'rt') as pages:
         for page in pages:
             words = page.strip().split()
             words = [self.encode(x) for x in words]
             yield words
Example 21
def getSiteInfoFromBz2File(somebz2filename):
    wikiurl = ''
    nses = []
    try:
        with bz2.open(somebz2filename, mode = 'rt') as fsource:
            innamespaces = False
            for line in fsource:
                if innamespaces:                    
                    if line.strip() == '</namespaces>':
                        break
                    line = line.split('key="')[1]
                    num = line.split('"')[0]
                    if line.find('<') > 1:
                        d = line.split('>')[1].split('<')[0]
                        nses.append(num + '#' + d)
                    continue
                if line.strip() == '<namespaces>':
                    innamespaces = True
                    nses.append('0#main')
                #b = myre1.split(line)
                matchobject = re.search(r'<base>(?P<url>.*?)</base>', line.strip())
                if matchobject:
                    #wikiurl = matchobject.group('url')
                    c = urllib.parse.urlparse(matchobject.group('url'))
                    wikiurl = urllib.parse.urlunparse((c[0], c[1], '', '', '', '')) + '/'
                    #wikiurl = urllib.parse.unquote(matchobject.group('url'))
                    print(wikiurl)
        return wikiurl, nses
    except:
        raise
Example 22
def open_zipped(infile, mode='r'):
    """return file handle of file regardless of compressed or not.

    also returns already opened files unchanged, text mode automatic for
    compatibility with python2.
    """
    # return already open files
    if hasattr(infile, 'write'):
        return infile
    # make text mode automatic
    if len(mode) == 1:
        mode = mode + 't'
    # refuse to handle non-strings that aren't files.
    if not isinstance(infile, str):
        raise ValueError("Cannot open a filename that isn't a string.")
    # treat '-' appropriately
    if infile == '-':
        if 'w' in mode:
            return sys.stdout
        return sys.stdin
    # if possible open zipped files
    if infile.endswith('.gz'):
        return _gzip.open(infile, mode)
    if infile.endswith('.bz2'):
        if hasattr(_bz2, 'open'):
            return _bz2.open(infile, mode)
        return _bz2.BZ2File(infile, mode)
    # fall back on regular open
    return open(infile, mode)
Example 23
def read_bz2(filepath):
    ''' This opens a bzip file, assuming that it contains a HathiTrust
    feature JSON, and extracts a list of pages.
    '''
    successflag = 'success'

    try:
        with bz2.open(filepath, mode='rt', encoding='utf-8') as f:
            jsonstring = f.read()
    except Exception:
        successflag = 'bzip2 failed'
        jsonstring = ''

    try:
        jobj = json.loads(jsonstring)
    except Exception:
        successflag = 'json decoding failed'
        jobj = dict()

    try:
        pagelist = jobj['features']['pages']
    except Exception:
        if successflag == 'success':
            successflag = 'json format unexpected'
        pagelist = []
        # If statement because we don't want to overwrite previous failures.

    return pagelist, successflag
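
A short usage sketch: the flag lets callers skip damaged volumes without raising (the path is hypothetical):

pagelist, flag = read_bz2("volume.json.bz2")
if flag != 'success':
    print('skipping volume:', flag)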
Example 24
    def _read_xml(self, xml_file):
        """Salva no banco de dados do Django e retorna lista das votações"""

        tree = None
        with bz2.open(xml_file, mode='rt', encoding="iso-8859-1") as f:
            tree = etree.fromstring(f.read())
        return tree
Example 25
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if 'album' in sense or 'film' in sense or 'series' in sense or 'disambiguation' in sense or 'song' in sense or 'band' in sense:
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Example 26
    def from_file(self, filename):
        """
        Read data from the file and return an intensity object wtih
        that data.
        """
        self.filename = filename
        
        raw_counts = list()

        if not os.path.exists(filename):
            bz2_name = "{}.bz2".format(filename)
            if os.path.exists(bz2_name):
                filename = bz2_name
                
        if filename.endswith("bz2"): 
            open_f = lambda x: bz2.open(x, "rt")
        else:
            open_f = open
        with open_f(filename) as stream_in:
            for line in csv.reader(stream_in):
                bin_left = int(line[0])
                bin_right = int(line[1])

                counts = tuple(map(int, line[2:]))

                self.times.append((bin_left, bin_right))

                raw_counts.append(counts)

        for channel, counts in enumerate(numpy.transpose(raw_counts)):
            self[channel] = counts
    
        return self
Example 27
def main():
    usage = 'usage: %prog [options] <fastq_file>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='length_min', default=None, type='int', help='Minimum read length')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide FASTQ file')
    else:
        fastq_file = args[0]

    if fastq_file[-3:] == '.gz':
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file[-4:] == '.bz2':
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)

    header = fastq_in.readline()
    while header:
        seq = fastq_in.readline()
        mid = fastq_in.readline()
        qual = fastq_in.readline()

        if options.length_min is not None:
            if len(seq)-1 >= options.length_min:
                print('%s%s%s%s' % (header,seq,mid,qual), end='')

        header = fastq_in.readline()

    fastq_in.close()
Example 28
    def _open(self, filename):
        """
        Open the input file. Set self._fp to point to it. Read the first
        line of parameters.

        @param filename: A C{str} filename containing JSON DIAMOND records.
        @raise ValueError: if the first line of the file isn't valid JSON,
            if the input file is empty, or if the JSON does not contain an
            'application' key.
        """
        if filename.endswith('.bz2'):
            if six.PY3:
                self._fp = bz2.open(filename, mode='rt', encoding='UTF-8')
            else:
                self._fp = bz2.BZ2File(filename)
        else:
            self._fp = open(filename)

        line = self._fp.readline()
        if not line:
            raise ValueError('JSON file %r was empty.' % self._filename)

        try:
            self.params = loads(line[:-1])
        except ValueError as e:
            raise ValueError(
                'Could not convert first line of %r to JSON (%s). '
                'Line is %r.' % (self._filename, e, line[:-1]))
Example 29
def decompress(zip_name, target_directory):
    """
    Decompresses the provided archive to the target directory. The following file extensions are supported:

    * zip
    * bz2
    * gz
    * tar
    * tar.gz
    * tgz
    * tar.bz2

    The decompression method is chosen based on the file extension.

    :param zip_name: The full path name to the file that should be decompressed.
    :param target_directory: The directory to which files should be decompressed. May or may not exist prior to calling
    this function.
    """
    path_without_extension, extension = splitext(zip_name)
    filename = basename(path_without_extension)
    # splitext() only yields the final extension, so the double extensions
    # (.tar.gz, .tar.bz2) must be checked against the full name first.
    if zip_name.endswith((".tar.gz", ".tar.bz2")) or extension in (".tar", ".tgz"):
        _do_decompress(target_directory, tarfile.open(zip_name))
    elif extension == ".zip":
        _do_decompress(target_directory, zipfile.ZipFile(zip_name))
    elif extension == ".bz2":
        _do_decompress_manually(target_directory, filename, bz2.open(zip_name))
    elif extension == ".gz":
        _do_decompress_manually(target_directory, filename, gzip.open(zip_name))
    else:
        raise RuntimeError("Unsupported file extension [%s]. Cannot decompress [%s]" % (extension, zip_name))
Example 30
def open_compressed(filename,mode='rb'):
    """
    Open a file for reading with automatic decompression.  Detects gzip, xz, and
    bz2 files via the file extension.

    Arguments
    ---------
    filename to open

    Returns
    -------
    open file object

    """

    ext = filename.split('.')[-1]

    if ext == 'gz':
        import gzip
        return gzip.open(filename,mode)
    elif ext == 'xz':
        import lzma
        return lzma.open(filename,mode)
    elif ext == 'bz2':
        import bz2
        return bz2.open(filename,mode)
    else:
        return open(filename,mode)
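
A usage sketch: with the default mode='rb' the returned handle yields bytes, so decode explicitly when text is needed (the file name is hypothetical):

with open_compressed("table.csv.bz2") as fh:
    header = fh.readline().decode("utf-8")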
Example 31
import bz2
from tqdm import tqdm

with bz2.open("section09/enwiki-20150112-400-r100-10576.txt.bz2",
              "rt") as f, open("section09/corpus.txt", mode='wt') as g:
    for line in tqdm(f):  # stream line by line instead of loading everything with readlines()
        line = line.split()
        for idx in range(len(line)):
            line[idx] = line[idx].strip(".,!?;:()[]\'\"")
        if len(line) == 0:
            continue
        g.write(" ".join(line))
        g.write("\n")
Example 32
samplingPeriod = float(header[3]) / 1000000.0
samplingFrequency = 1 / samplingPeriod

if args.time:
    samplingPeriod = args.time / len(samples)
    print(
        f'Adjusting profile frequency {samplingFrequency:.2f} Hz to {1 / samplingPeriod:.2f} Hz'
    )
    samplingFrequency = 1 / samplingPeriod

print(
    f"Extracted {sampleCount} samples taken at {samplingFrequency:.2f} Hz (ignored {stackTraces} stack traces) for {len(samples) * samplingPeriod:.2f}s time"
)

if args.output.endswith(".bz2"):
    csvFile = bz2.open(args.output, "wt")
else:
    csvFile = open(args.output, "w")

csvFile.write('time;pc0\n')
runningTime = 0.0

for sample in samples:
    runningTime += samplingPeriod
    csvFile.write(f'{runningTime:.16f};{sample}\n')

print(f"Wrote to {args.output}")

csvFile.close()

if (not args.vmmap):
Example 33
 def __iter__(self):
     for fname in os.listdir(self.dirname):
         print("processing~  '{}'".format(fname))
         for line in bz2.open(os.path.join(self.dirname, fname), "rt"):
             yield sent_to_spacing_chars(line.strip()).split(splitc)
Example 34
    def __init__(self, volumepath, volumeid):
        '''Initializes a LoadedVolume by reading wordcounts from
        a json file. By default it reads all the pages. But if
        skip-front and skip-back are set to positive values,
        it will skip n pages.'''

        if volumepath.endswith('bz2'):
            with bz2.open(volumepath, mode='rt', encoding='utf-8') as f:
                thestring = f.read()
        else:
            with open(volumepath, encoding='utf-8') as f:
                thestring = f.read()

        thejson = json.loads(thestring)

        self.volumeid = thejson['id']

        pagedata = thejson['features']['pages']

        self.numpages = len(pagedata)
        self.pagecounts = []
        self.totalcounts = Counter()
        self.totaltokens = 0
        self.bodytokens = 0

        chunktokens = 0
        typesinthischunk = set()
        # a set of types in the current 10k-word chunk; progress
        # toward which is tracked by chunktokens

        self.integerless_pages = 0
        self.skipped_pages = 0
        compromise_pg = 0

        capitalizedbodytokens = 0

        for i in range(self.numpages):
            thispagecounts = Counter()
            thisbodytokens = 0
            thisheadertokens = 0
            thispage = pagedata[i]

            # There are really two ways of numbering pages. They come in an order,
            # which gives them an inherent ordinality (this is the *first* page). But
            # they also have cardinal *labels* attached, in the "seq" field. These labels
            # are usually, but not necessarily, convertible to integers. (Usually "00000001",
            # but could be "notes.") *Usually* they are == to the ordinal number,
            # but again, not necessarily! The world is full of fun edge cases!

            # In this loop, i is the ordinal page number, and cardinal_page is the cardinal
            # label; its value will be -1 if it can't be converted to an integer.

            # compromise_pg skips pages that have no integer seq, but otherwise
            # proceeds ordinally

            try:
                cardinal_page = int(thispage['seq'])
            except (KeyError, ValueError, TypeError):
                cardinal_page = -1

            if cardinal_page > 0:
                compromise_pg += 1
            elif cardinal_page < 0:
                self.integerless_pages += 1

            if cardinal_page >= 0:

                bodywords = thispage['body']['tokenPosCount']
                for token, partsofspeech in bodywords.items():

                    normaltokenlist = normalize_token(token)

                    # Notice that we treat each word as a list, to permit
                    # counting both parts of a hyphenated word.
                    # But usually this will be a list of one.

                    for normaltoken in normaltokenlist:

                        for part, count in partsofspeech.items():
                            thisbodytokens += count
                            chunktokens += count
                            thispagecounts[normaltoken] += count

                headerwords = thispage['header']['tokenPosCount']
                for token, partsofspeech in headerwords.items():
                    normaltokenlist = normalize_token(token)

                    for normaltoken in normaltokenlist:
                        normaltoken = "#header" + normaltoken

                        for part, count in partsofspeech.items():
                            thisheadertokens += count
                            thispagecounts[normaltoken] += count

                # You will notice that I treat footers (mostly) as part of the body
                # Footers are rare, and rarely interesting.

                footerwords = thispage['footer']['tokenPosCount']
                for token, partsofspeech in footerwords.items():

                    normaltokenlist = normalize_token(token)

                    for normaltoken in normaltokenlist:

                        for part, count in partsofspeech.items():
                            thisbodytokens += count
                            chunktokens += count
                            thispagecounts[normaltoken] += count

                self.pagecounts.append(thispagecounts)

                for key, value in thispagecounts.items():
                    self.totalcounts[key] += value

                self.totaltokens += thisbodytokens
                self.totaltokens += thisheadertokens
                self.bodytokens += thisbodytokens

            else:
                # print(i, cardinal_page, compromise_pg)
                self.skipped_pages += 1
Example 35
def get_tweet_gen(tweet_fpath: str) -> Iterator:
    with bz2.open(tweet_fpath) as fbz:
        for line in fbz:
            yield json.loads(line)["tweet"]
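
A short usage sketch for the generator above, assuming a file whose lines each hold one JSON object with a "tweet" key (the path is hypothetical):

for tweet in get_tweet_gen("tweets.json.bz2"):
    print(tweet)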
Example 36
def parse_seq_pe(opts, bc_dict, Flowcell, Lane):
    """Fastq/a-parser for PE-reads"""
    if opts.reads1.endswith('.gz'):
        seq1_handle = gzip.open(opts.reads1, "rb")
        seq2_handle = gzip.open(opts.reads2, "rb")
    elif opts.reads1.endswith('.bz2'):
        seq1_handle = bz2.open(opts.reads1, "rb")
        seq2_handle = bz2.open(opts.reads2, "rb")
    else:
        try:
            seq1_handle = open(opts.reads1, "r")
            seq2_handle = open(opts.reads2, "r")
        except IOError:
            seq1_handle = gzip.open(opts.reads1+'.gz', "rb")
            seq2_handle = gzip.open(opts.reads2+'.gz', "rb")
            opts.reads1+='.gz'

    if not opts.split:
        seq1_name = '%(code)s_%(Flowcell)s_s_%(lane)s_fastq.txt'% \
                    ({'code': 'R1_%s'%opts.output.split('/')[-2],'Flowcell':Flowcell, 'lane':Lane})
        seq2_name = '%(code)s_%(Flowcell)s_s_%(lane)s_fastq.txt'% \
                    ({'code': 'R2_%s'%opts.output.split('/')[-2],'Flowcell':Flowcell, 'lane':Lane})
        if opts.reads1.endswith('.gz'):
            seq1_name += '.gz'
            seq2_name += '.gz'
            seq1_out = gzip.open(os.path.join(opts.output, seq1_name), 'a')
            seq2_out = gzip.open(os.path.join(opts.output, seq2_name), 'a')
        else:
            seq1_out = open(os.path.join(opts.output, seq1_name), 'a')
            seq2_out = open(os.path.join(opts.output, seq2_name), 'a')
    if opts.reads1.endswith('.gz'):
        nomatch1_out= gzip.open(opts.nomatch1, "w")
        nomatch2_out= gzip.open(opts.nomatch2, "w")
    else:
        nomatch1_out= open(opts.nomatch1,  "w")
        nomatch2_out= open(opts.nomatch2, "w")
    seq = 0
    bc_set_left = set(k[0] for k in bc_dict.keys())
    bc_set_right = set(k[1] for k in bc_dict.keys())
    elements_1 = [entry.enz_remnant_R1 for entry in bc_dict.values()]
    elements_2 = [entry.enz_remnant_R2 for entry in bc_dict.values()]
    enz_sites_left = []
    enz_sites_right = []
    if opts.control_nucleotide:
        for nt in ['C','T']:
            for element in elements_1[0]:
                if nt+element[0] not in enz_sites_left:
                    #implement search which includes control nucleotide
                    enz_sites_left += [nt + element]
            for element in elements_2[0]:
                if nt+element[0] not in enz_sites_right:
                    enz_sites_right += [nt + element]
    else:
        for element in elements_1[0]:
            if element[0] not in enz_sites_left:
                # implement search which includes control nucleotide
                enz_sites_left += [element]
        for element in elements_2[0]:
            if element[0] not in enz_sites_right:
                enz_sites_right += [element]
    max_bc_len_left  =  max(k[0][0] + len(k[0][1]) for k in bc_dict.keys()) + max(len(k) for k in enz_sites_left)
    max_bc_len_right =  max(k[1][0] + len(k[1][1]) for k in bc_dict.keys()) + max(len(k) for k in enz_sites_right)
    left_read = [True]
    while left_read[0]:
        seq += 1
        left_read = []
        right_read = []
        for i in range(4):
            try:
                left_read +=  [seq1_handle.readline()]
                right_read += [seq2_handle.readline()]
            except StopIteration:
                break
        left_bc,wobble_left,left_start,control_left = levenshtein(left_read, bc_set_left,enz_sites_left, opts.mismatch, max_bc_len_left)
        right_bc,wobble_right,right_start,control_right = levenshtein(right_read, bc_set_right,enz_sites_right, opts.mismatch, max_bc_len_right)
        if left_bc and right_bc:
            #Put the correct sequence of the barcode
            try:
                bc_dict['%s_%s'%(left_bc, right_bc)+'_count'] += 1
            except KeyError:
                bc_dict['%s_%s'%(left_bc, right_bc)+'_count'] = 1
            if opts.addRG:
                #determine if read is watson or crick.
                try:
                    SM_id = bc_dict[((3,left_bc),(3,right_bc))].Sample
                except KeyError:
                    #This can only happen if the barcode is incorrectly read
                    try:
                        SM_id = bc_dict[((0, left_bc), (0, right_bc))].Sample
                    except KeyError:
                        continue
                #one control nucleotide should be converted the other not. If this succeeds than call read type (watson,crick)
                #based on left nucleotide. if this is
                if control_left != control_right:
                    strand = control_left
                else:
                    strand = control_left
                RG_id = '%s_%s_%s'%(Flowcell,Lane,SM_id)
                if wobble_left == '':
                    wobble_left = 'NNN'
                if wobble_right == '':
                    wobble_right = 'NNN'
                wobble = wobble_left + "_" + wobble_right
                left_read[0] =  left_read[0].split(' ')[0].rstrip('\n') \
                                + '\tBC:Z:%s\tBC:Z:%s\tRG:Z:%s\tST:Z:%s\n'%(left_bc, right_bc, RG_id, strand)

                right_read[0] = right_read[0].split(' ')[0].rstrip('\n') \
                                + '\tBL:Z:%s\tBR:Z:%s\tRG:Z:%s\tST:Z:%s\n'%(left_bc,right_bc, RG_id, strand)
                if opts.control_nucleotide:
                    left_read[0] = left_read[0][:-1] + '\tRN:Z:%s\n' % wobble
                    right_read[0] = right_read[0][:-1] + '\tRN:Z:%s\n' % wobble
            else:
                id = left_read[0][:-1]
            if opts.delete:
                #+1 because of control nucleotide after barcode
                if opts.control_nucleotide:
                    control_NT = 'C'
                else:
                    control_NT = ''
                left_read[1] = left_read[1][left_start + len(left_bc + control_NT):]
                left_read[3] = left_read[3][left_start + len(left_bc + control_NT):]
                right_read[1] = right_read[1][right_start + len(right_bc + control_NT):]
                right_read[3] = right_read[3][right_start + len( right_bc + control_NT):]
            if not opts.split:
                seq1_out.write(''.join(left_read))
                seq2_out.write(''.join(right_read))
            else:
                #If splitting is activated, compression takes too long, disable!
                # The format string previously had two placeholders but only
                # one argument, which raised a TypeError at runtime.
                output_location_1 = os.path.join(opts.output, "%s_1.fastq" % bc_dict[((3, left_bc), (3, right_bc))].Sample)
                output_location_2 = os.path.join(opts.output, "%s_2.fastq" % bc_dict[((3, left_bc), (3, right_bc))].Sample)
                output_handle_1 = open(output_location_1, 'a')
                output_handle_2 = open(output_location_2, 'a')
                output_handle_1.write(''.join(left_read))
                output_handle_2.write(''.join(right_read))
        else:
            #Barcode sequence was not recognized
            nomatch1_out.write(''.join(left_read))
            nomatch2_out.write(''.join(right_read))
    seq1_out.close()
    seq2_out.close()
    nomatch1_out.close()
    nomatch2_out.close()
    return bc_dict
Example 37
 def load(pklfile):  
     #print("Loading from {}".format(pklfile))
     with bz2.open(pklfile, "rb") as fin:
         ret = pickle.load(fin)
     return ret
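
A matching save counterpart might look like this (a minimal sketch, assuming the same bzip2-compressed pickle format):

import bz2
import pickle

def save(obj, pklfile):
    with bz2.open(pklfile, "wb") as fout:
        pickle.dump(obj, fout)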
Example 38
def convert(filepath_or_fileobj,
            dbpath,
            table,
            headerspath_or_fileobj=None,
            compression=None,
            typespath_or_fileobj=None):
    if isinstance(filepath_or_fileobj, string_types):
        if compression is None:
            fo = open(filepath_or_fileobj, mode=read_mode)
        elif compression == 'bz2':
            try:
                fo = bz2.open(filepath_or_fileobj, mode=read_mode)
            except AttributeError:
                fo = bz2.BZ2File(filepath_or_fileobj, mode='r')
        elif compression == 'gzip':
            fo = gzip.open(filepath_or_fileobj, mode=read_mode)
    else:
        fo = filepath_or_fileobj

    try:
        dialect = csv.Sniffer().sniff(fo.readline())
    except TypeError:
        dialect = csv.Sniffer().sniff(str(fo.readline()))
    fo.seek(0)

    # get the headers
    header_given = headerspath_or_fileobj is not None
    if header_given:
        if isinstance(headerspath_or_fileobj, string_types):
            ho = open(headerspath_or_fileobj, mode=read_mode)
        else:
            ho = headerspath_or_fileobj
        header_reader = csv.reader(ho, dialect)
        headers = [header.strip() for header in next(header_reader)]
        ho.close()
    else:
        reader = csv.reader(fo, dialect)
        headers = [header.strip() for header in next(reader)]
        fo.seek(0)

    # get the types
    if typespath_or_fileobj is not None:
        if isinstance(typespath_or_fileobj, string_types):
            to = open(typespath_or_fileobj, mode=read_mode)
        else:
            to = typespath_or_fileobj
        type_reader = csv.reader(to, dialect)
        types = [_type.strip() for _type in next(type_reader)]
        to.close()
    else:
        # guess types
        type_reader = csv.reader(fo, dialect)
        if not header_given: next(type_reader)
        types = _guess_types(type_reader, len(headers))
        fo.seek(0)

    # now load data
    _columns = ','.join([
        '"%s" %s' % (header, _type) for (header, _type) in zip(headers, types)
    ])

    reader = csv.reader(fo, dialect)
    if not header_given:  # Skip the header
        next(reader)

    conn = sqlite3.connect(dbpath)
    # shz: fix error with non-ASCII input
    conn.text_factory = str
    c = conn.cursor()

    try:
        create_query = 'CREATE TABLE %s (%s)' % (table, _columns)
        c.execute(create_query)
    except sqlite3.OperationalError:
        pass  # table already exists

    _insert_tmpl = 'INSERT INTO %s VALUES (%s)' % (table, ','.join(
        ['?'] * len(headers)))

    line = 0
    for row in reader:
        line += 1
        if len(row) == 0:
            continue
        # we need to take out commas from int and floats for sqlite to
        # recognize them properly ...
        try:
            row = [
                None if x == '' else float(x.replace(',', ''))
                if y == 'real' else int(x) if y == 'integer' else x
                for (x, y) in zip(row, types)
            ]
            c.execute(_insert_tmpl, row)
        except ValueError:
            # In Python 3 the comprehension variables x and y are not in
            # scope here, so only the line number can be reported.
            print("Unable to convert a value on line %d" % line,
                  file=sys.stderr)
        except Exception as e:
            print("Error on line %d: %s" % (line, e), file=sys.stderr)

    conn.commit()
    c.close()
Example 39
    def _resolve_archive(self, filename, subpath=None):
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            zf = zipfile.ZipFile(filename)
            # MacOS is found guilty of adding extra files into the Zip archives
            # it creates. The files are hidden, and in the directory __MACOSX/.
            # We remove those files from the list, since they are not real user
            # files, and have an unknown binary format.
            zff = [
                name for name in zf.namelist()
                if not (name.startswith("__MACOSX/") or name.endswith("/"))
            ]
            if subpath:
                if subpath in zff:
                    zff = [subpath]
                else:
                    raise TValueError("File `%s` does not exist in archive "
                                      "`%s`" % (subpath, filename))
            if len(zff) > 1:
                warnings.warn(
                    "Zip file %s contains multiple compressed "
                    "files: %r. Only the first of them will be used." %
                    (filename, zff),
                    category=FreadWarning)
            if len(zff) == 0:
                raise TValueError("Zip file %s is empty" % filename)
            if self._verbose:
                self._logger.debug("Extracting %s to temporary directory %s" %
                                   (filename, self.tempdir))
            self._tempfiles.append(zf.extract(zff[0], path=self.tempdir))
            self._file = self._tempfiles[-1]

        elif ext == ".gz":
            import gzip
            zf = gzip.GzipFile(filename, mode="rb")
            if self._verbose:
                self._logger.debug("Extracting %s into memory" % filename)
            self._text = zf.read()
            if self._verbose:
                self._logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".bz2":
            import bz2
            with bz2.open(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" %
                                       len(self._text))

        elif ext == ".xz":
            import lzma
            with lzma.open(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" %
                                       len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        else:
            self._file = filename
Example 40
# Compression with gzip
import gzip
with gzip.open('somefile.gz', 'rt') as f:
    text = f.read()
# Compression with bz2
import bz2
with bz2.open('somefile.bz2', 'rt') as f:
    text = f.read()

As shown above, all input and output uses text and performs Unicode
encoding/decoding. If you want to work with binary data instead, use a file
mode of rb or wb.

When writing compressed data, the compression level can be set with the
optional keyword argument compresslevel. For example:

with gzip.open('somefile.gz', 'wt', compresslevel=5) as f:
    f.write(text)

The default level is 9, which is the highest. Lower levels increase speed but
reduce the degree of compression.
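
bz2.open() accepts the same compresslevel keyword argument, so a bzip2 counterpart might look like this (a minimal sketch, assuming text is already defined):

import bz2
with bz2.open('somefile.bz2', 'wt', compresslevel=5) as f:
    f.write(text)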
Example 41
def do_bz2_open(user_input, size=-1):
    try:
        # size=-1 reads to EOF; the previous default of 0 returned an
        # empty bytes object.
        with bz2.open(user_input) as bz2file:
            return bz2file.read(size)
    except Exception:
        return None
Example 42
def threadfun(startf, endf, pid):
    ## Individual and Co-Occurence Counts
    print("Started Process : {} , PID :{}, Startf:{}  Endf: {}".format(
        pid, os.getpid(), startf, endf))
    print("Loading Individual count files..")
    with open('CNG_count.pickle', 'rb') as handle:
        CNG_count = pickle.load(handle)
    with open('CNGG_count.pickle', 'rb') as handle:
        CNGG_count = pickle.load(handle)
    with open('Word_count.pickle', 'rb') as handle:
        Word_count = pickle.load(handle)
    with open('Lemma_count.pickle', 'rb') as handle:
        Lemma_count = pickle.load(handle)

    def from_dict(Type_1, Type_2):
        with open(Type_1 + '|' + Type_2 + '.json', 'r') as fp:
            d = json.load(fp)
        return d

    print("Loading Co-Occurence count files...")
    CNG_Distinct = len(CNG_count.keys())
    graph = {
        'LemmaLemma': from_dict('Lemma', 'Lemma'),
        'LemmaWord': from_dict('Lemma', 'Word'),
        'LemmaCNG': from_dict('Lemma', 'CNG'),
        'LemmaCNG_Group': from_dict('Lemma', 'CNG_Group'),
        'WordLemma': from_dict('Word', 'Lemma'),
        'WordWord': from_dict('Word', 'Word'),
        'WordCNG': from_dict('Word', 'CNG'),
        'WordCNG_Group': from_dict('Word', 'CNG_Group'),
        'CNGLemma': from_dict('CNG', 'Lemma'),
        'CNGWord': from_dict('CNG', 'Word'),
        'CNGCNG': from_dict('CNG', 'CNG'),
        'CNGCNG_Group': from_dict('CNG', 'CNG_Group'),
        'CNG_GroupLemma': from_dict('CNG_Group', 'Lemma'),
        'CNG_GroupWord': from_dict('CNG_Group', 'Word'),
        'CNG_GroupCNG': from_dict('CNG_Group', 'CNG'),
        'CNG_GroupCNG_Group': from_dict('CNG_Group', 'CNG_Group')
    }
    savedir = 'features/'

    ## Reading Metapaths
    # metapaths = []
    # with open('feature_ranklist_BM2_t2.txt','r') as file:
    #     rd = file.readlines()
    #     for row in rd:
    #         metapaths.append(row.split(',')[1])

    # print(len(metapaths))

    # Reading Metapaths
    df = pd.read_csv("featureStats.csv")
    metapaths = list(df[df["p2_4K_bigram_mir"] == 1]['FeatureName'])
    print(len(metapaths))

    ##Some utility functions
    def checktype(el):
        if (el.lstrip("-").isdigit()):
            return "CNG"
        elif (el == 'C'):
            return 'C'
        elif (el == 'T'):
            return "T"
        elif (el == 'L'):
            return "L"
        else:
            return "CNG_Group"

    def denfun(el, eltype):
        if (eltype == 'CNG' or eltype == 'C'):
            return CNG_count.get(int(el), 0)
        elif (eltype == 'L'):
            return Lemma_count.get(el, 0)
        elif (eltype == 'W'):
            return Word_count.get(el, 0)
        else:
            return CNGG_count.get(el, 0)

    def changetype(typ):
        if (typ == 'L'):
            return "Lemma"
        elif (typ == 'C'):
            return "CNG"
        elif (typ == 'T'):
            return "Word"
        else:
            return typ

    ##Actual Work Starts here
    gdir = 'After_graphml/'
    x = os.listdir(gdir)
    x.sort()
    fc = 0

    print("Started Iterating over files :{} - {}".format(startf, endf))

    for gfile in x[startf:endf]:
        ##iterating over 119k files
        try:
            G = read_graphml(gdir + gfile)
            cur = []
            for i in range(1 + G.number_of_nodes()):
                cur.append([])
                for j in range(1 + G.number_of_nodes()):
                    cur[i].append(0)

            glemma = nx.get_node_attributes(G, 'lemma')
            gword = nx.get_node_attributes(G, 'word')
            gcng = nx.get_node_attributes(G, 'cng')
            ec = 0
            for snode, enode, d in G.edges_iter(data=True):
                ##iterating over all edges
                # print(snode,enode)
                ar = np.zeros(1500)
                r = 0
                c = 0
                w = 0
                l = 0
                g = 0
                o = 0
                for row in metapaths:
                    ##iterating over 1500 metapaths
                    row = row.split('*')
                    if (len(row) == 2):
                        node1 = row[0]
                        type1 = checktype(node1)
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        if (type2 == 'T'):
                            node2 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type2 == 'L'):
                            node2 = glemma[enode]
                        elif (type2 == 'C'):
                            node2 = gcng[enode]
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type12 = type1 + type2
                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob = prob12

                    elif (len(row) == 3):
                        node1 = row[0]
                        type1 = checktype(node1)
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        den2 = denfun(node2, type2)
                        node3 = row[2]
                        type3 = checktype(node3)
                        if (type3 == 'T'):
                            node3 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type3 == 'L'):
                            node3 = glemma[enode]
                        elif (type3 == 'C'):
                            node3 = gcng[enode]
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type3 = changetype(type3)
                        type12 = type1 + type2
                        type23 = type2 + type3

                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        num23 = graph[type23].get(
                            str(node2) + '|' + str(node3), 0)
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob23 = (float(num23) + 1) / (den2 + CNG_Distinct)
                        prob = prob12 * prob23

                    elif (len(row) == 4):
                        node1 = row[0]
                        type1 = checktype(node1)
                        if (type1 == 'T'):
                            node1 = glemma[snode] + '_' + str(gcng[snode])
                        elif (type1 == 'L'):
                            node1 = glemma[snode]
                        elif (type1 == 'C'):
                            node1 = gcng[snode]
                        den1 = denfun(node1, type1)
                        node2 = row[1]
                        type2 = checktype(node2)
                        den2 = denfun(node2, type2)
                        node3 = row[2]
                        type3 = checktype(node3)
                        den3 = denfun(node3, type3)
                        node4 = row[3]
                        type4 = checktype(node4)
                        if (type4 == 'T'):
                            node4 = glemma[enode] + '_' + str(gcng[enode])
                        elif (type4 == 'L'):
                            node4 = glemma[enode]
                        elif (type4 == 'C'):
                            node4 = gcng[enode]
                        type1 = changetype(type1)
                        type2 = changetype(type2)
                        type3 = changetype(type3)
                        type4 = changetype(type4)
                        type12 = type1 + type2
                        type23 = type2 + type3
                        type34 = type3 + type4
                        num12 = graph[type12].get(
                            str(node1) + '|' + str(node2), 0)
                        num23 = graph[type23].get(
                            str(node2) + '|' + str(node3), 0)
                        num34 = graph[type34].get(
                            str(node3) + '|' + str(node4), 0)
                        prob12 = (float(num12) + 1) / (den1 + CNG_Distinct)
                        prob23 = (float(num23) + 1) / (den2 + CNG_Distinct)
                        prob34 = (float(num34) + 1) / (den3 + CNG_Distinct)
                        prob = prob12 * prob23 * prob34
                    else:
                        # guard: without this, a malformed row would silently
                        # reuse the previous metapath's prob value
                        print("Invalid metapath length:", row)
                        prob = 0.0
                    ar[r] = prob
                    r += 1

                cur[int(snode)][int(enode)] = ar
            fc += 1
            print("File Number :{}; pid: {}".format(fc, pid))
            print("fine till here")
            with bz2.open(
                    str(savedir) + str(gfile.split(".graphml")[0]) + '.bz2',
                    'wb') as f:
                pickle.dump(cur, f)

            # with bz2.open(str(savedir)+str(gfile.split(".graphml")[0])+'.bz2', 'rb') as f:
            # 	y = pickle.load(f)

        except Exception as e:
            print(e)
            print("Error at file :{}".format(str(gfile)))
            continue
    print("All Done for pid :{}".format(pid))


# threadfun(0,1000,1)
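# A hedged sketch of dispatching the chunked runs in parallel; threadfun is
# the enclosing worker (startf, endf, pid), and the 1000-file chunk size is
# illustrative, mirroring the commented call above:
#
# from multiprocessing import Process
# procs = [Process(target=threadfun, args=(start, start + 1000, pid))
#          for pid, start in enumerate(range(0, 4000, 1000))]
# for p in procs:
#     p.start()
# for p in procs:
#     p.join()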
Example n. 43
0
    def write_bz2(self, data):
        with bz2.open(self.path, 'wb') as f:
            f.write(data)
Example n. 44
0
            print('uvloop is not installed!')
            exit(1)

    if not cargs.stateless:
        # Logging starts here
        # Create directory for logs if it doesn't exist
        if not os.path.exists('logs'):
            os.mkdir('logs')

        # Compress logfiles that were left over from the last run
        os.chdir('logs')
        if not os.path.exists('old'):
            os.mkdir('old')
        for item in os.listdir('.'):
            if item.endswith('.log'):
                with open(item, 'rb') as src, bz2.open(item + '.bz2', 'wb') as f:
                    f.write(src.read())
                os.remove(item)
        for item in os.listdir('.'):
            if item.endswith('.gz') or item.endswith('.bz2'):
                os.rename(item, 'old/' + item)
        os.chdir('..')

    # Define a format
    now = str(datetime.datetime.now()).replace(' ', '_').replace(':', '-').split('.')[0]
    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

    # Setting up loggers
    logger = logging.getLogger('liara')
Example n. 45
0
def xopen(filename, mode='r'):
    """
	Replacement for the "open" function that can also open files that have
	been compressed with gzip, bzip2 or xz. If the filename is '-', standard
	output (mode 'w') or input (mode 'r') is returned. If the filename ends
	with .gz, the file is opened with a pipe to the gzip program. If that
	does not work, then gzip.open() is used (the gzip module is slower than
	the pipe to the gzip program). If the filename ends with .bz2, it's
	opened as a bz2.BZ2File. Otherwise, the regular open() is used.

	mode can be: 'rt', 'rb', 'at', 'ab', 'wt', or 'wb'
	Instead of 'rt', 'wt' and 'at', 'r', 'w' and 'a' can be used as
	abbreviations.

	In Python 2, the 't' and 'b' characters are ignored.

	Append mode ('a', 'at', 'ab') is unavailable with BZ2 compression and
	will raise an error.
	"""
    if mode in ('r', 'w', 'a'):
        mode += 't'
    if mode not in ('rt', 'rb', 'wt', 'wb', 'at', 'ab'):
        raise ValueError("mode '{0}' not supported".format(mode))
    if not _PY3:
        mode = mode[0]
    if not isinstance(filename, basestring):
        raise ValueError("the filename must be a string")

    # standard input and standard output handling
    if filename == '-':
        return dict(r=sys.stdin,
                    rt=sys.stdin,
                    rb=sys.stdin.buffer,
                    w=sys.stdout,
                    wt=sys.stdout,
                    wb=sys.stdout.buffer)[mode]

    if filename.endswith('.bz2'):
        if bz2 is None:
            raise ImportError(
                "Cannot open bz2 files: The bz2 module is not available")
        if _PY3:
            return bz2.open(filename, mode)
        else:
            if mode[0] == 'a':
                raise ValueError(
                    "mode '{0}' not supported with BZ2 compression".format(
                        mode))
            if sys.version_info[:2] <= (2, 6):
                return ClosingBZ2File(filename, mode)
            else:
                return bz2.BZ2File(filename, mode)
    elif filename.endswith('.xz'):
        if lzma is None:
            raise ImportError(
                "Cannot open xz files: The lzma module is not available (use Python 3.3 or newer)"
            )
        return lzma.open(filename, mode)
    elif filename.endswith('.gz'):
        if _PY3:
            if 't' in mode:
                # gzip.open in Python 3.2 does not support modes 'rt' and 'wt'
                if sys.version_info > (3, 3):
                    return gzip.open(filename, mode)
                else:
                    return io.TextIOWrapper(gzip.open(filename, mode[0]))
            else:
                if 'r' in mode:
                    return io.BufferedReader(gzip.open(filename, mode))
                else:
                    return io.BufferedWriter(gzip.open(filename, mode))
        else:
            # rb/rt are equivalent in Py2
            if 'r' in mode:
                try:
                    return PipedGzipReader(filename)
                except OSError:
                    # gzip not installed
                    return buffered_reader(gzip.open(filename, mode))
            else:
                try:
                    return PipedGzipWriter(filename, mode)
                except OSError:
                    return buffered_writer(gzip.open(filename, mode))
    else:
        return open(filename, mode)
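
# A minimal usage sketch (the file name is hypothetical): the '.bz2'
# extension alone routes the call through bz2.open, and text mode is the
# default, so the round trip below needs no codec-specific code.
with xopen('example.txt.bz2', 'w') as fh:
    fh.write('hello world\n')
with xopen('example.txt.bz2') as fh:
    print(fh.read())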
Example n. 46
0
    def read_bz2(self):
        with bz2.open(self.path, 'rb') as f:
            return f.read()
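
    # Together with write_bz2 from Example n. 43 this gives a byte-level
    # round trip (a sketch; the holder class with a .path attribute is assumed):
    # obj.write_bz2(b'payload')
    # assert obj.read_bz2() == b'payload'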
Example n. 47
0
def detect_archive_format_and_open(path):
    if path.endswith(".bz2"):
        return bz2.open(path, mode='rt')
    if path.endswith(".gz"):
        return gzip.open(path, mode='rt')
    return open(path)
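
# Usage sketch (the path is hypothetical): every branch returns a text-mode
# handle, so the caller can iterate lines without caring about the codec.
fh = detect_archive_format_and_open('dump.ndjson.bz2')
for line in fh:
    print(line, end='')
fh.close()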
Example n. 48
0
    def read(self):
        with bz2.open(self.path) as f:
            return [json.loads(line) for line in f]
Example n. 49
0
# creating a list of probe IDs for probes hosted in EU countries
prb_ID_EU = pd.read_csv("m_AS_EU_hosting.csv")
prb_id_list = list(prb_ID_EU["prb_id"])

#%%

# asking the user to provide a filename to work with
filename = input("enter the filename you want to work with: ")
print("the name of the file you entered is", filename)
bz2Filename = str(filename)

# counting total number of lines and the time taken for reading the lines
## fname should be like "ping-2020-02-20T0000.bz2"
starttimereadinglines = time.time()
bz2File = bz2.open(bz2Filename, 'rt')
count_nrlines = 0
for line in bz2File:
    count_nrlines += 1
    if count_nrlines > 10000:  ## TODO: cap for a quick check; remove to count the whole file
        break
print("Total number of lines is:", count_nrlines)
# closing the file
bz2File.close()
# creating the end time variable
endtimereadingfiles = time.time()
# printing the total time taken for reading the lines
print("total time for reading", str(count_nrlines), "lines is",
      endtimereadingfiles - starttimereadinglines, "in seconds")

#%%
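# An equivalent, shorter counting sketch (same bz2Filename, but without the
# 10000-line cap): the with-block closes the handle even if reading fails.
starttime = time.time()
with bz2.open(bz2Filename, 'rt') as fh:
    total_lines = sum(1 for _ in fh)
print("total number of lines:", total_lines,
      "read in", time.time() - starttime, "seconds")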
Example n. 50
0
def open_files():
    '''
    Goes through the directory containing all the data files.
    '''
    #path = os.path.expanduser('/data/files.pushshift.io/reddit/submissions')
    os.chdir('/data/files.pushshift.io/reddit/submissions')
    #files = [f for f in os.listdir(path)] #issue with RS_2011-01.bz2 having some non unicode-32 characters.
    #files = ['RS_2017-11.bz2','RS_2017-10.bz2','RS_2017-08.bz2','RS_2017-07.bz2','RS_2017-06.bz2','RS_2017-05.bz2','RS_2017-04.bz2']
    #files = ['RS_2011-01.bz2', 'RS_2012-01.bz2','RS_2013-01.bz2','RS_2014-01.bz2','RS_2015-01.gz','RS_2016-01.gz','RS_2017-01.bz2','RS_2018-01.xz','RS_2019-01.gz']
    #files = ['RS_2012-01.bz2','RS_2013-01.bz2','RS_2014-01.bz2','RS_2015-01.gz','RS_2016-01.gz','RS_2017-01.bz2','RS_2018-01.xz','RS_2019-01.gz']
    files = ['RS_2011-01.bz2']
    def open_dump(fname):
        # The original code repeated one near-identical block per year, with
        # only the output filename and the codec changing, so pick the
        # decompressor from the extension instead: the pushshift dumps switch
        # between bz2, gzip and xz depending on the month.
        if fname.endswith('.bz2'):
            return bz2.open(fname, 'r')
        if fname.endswith('.gz'):
            return gzip.open(fname)
        if fname.endswith('.xz'):
            return lzma.open(fname, mode='rt')
        return open(fname)

    for i in files:
        year = i[3:7]
        date_fname = "/home/bmountain/dm_project/output_{}.json".format(year)
        with open(date_fname, "r+") as json_date_file:
            data = json.load(json_date_file)
        with open("/home/bmountain/dm_project/output_master.json", "r+") as json_master:
            master_data = json.load(json_master)
        with open_dump(i) as content:
            print(datetime.datetime.now(), 'opening ' + i)
            for line in content:
                try:
                    post = json.loads(line)
                    sub = post.get("subreddit")
                    if sub in subreddit_list and post.get("score") > 10:  # arbitrary threshold
                        log_normalized_score = math.log(post.get("score")) * 1.0
                        # setdefault collapses the original in/not-in pairs:
                        # append when the subreddit is known, create otherwise
                        data.setdefault(sub, []).append(
                            [post.get("title"), log_normalized_score])
                        master_data.setdefault(sub, []).append(
                            [post.get("title"), log_normalized_score])
                except Exception:
                    # skip lines that fail to parse (e.g. the non unicode-32
                    # characters noted above for RS_2011-01.bz2)
                    pass
        with open("/home/bmountain/dm_project/output_master.json", "w") as master_write:
            json.dump(master_data, master_write)
        with open(date_fname, "w") as j_file:
            json.dump(data, j_file)
Example n. 51
0
# Load model, feature extractor
model = keras.applications.VGG16(weights='imagenet', include_top=True)

targetSize = model.input_shape[1:3]
print("Target size: %s x %s" % targetSize)

feat_extractor = Model(inputs=model.input,
                       outputs=model.get_layer("fc2").output)

print("Extracting features from each image...")
features = np.zeros((fileCount, 4096), dtype=np.float32)
for i, fn in enumerate(files):
    im = image.load_img(fn, target_size=targetSize)
    x = image.img_to_array(im)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    feat = feat_extractor.predict(x)[0]
    features[i] = feat
    printProgress(i + 1, fileCount)

print("Reducing feature vectors down to %s features..." % a.PCA_COMPONENTS)
pca = PCA(n_components=a.PCA_COMPONENTS)
pca.fit(features)
pca_features = pca.transform(features)

print("Saving features file %s..." % a.OUTPUT_FILE)
makeDir(a.OUTPUT_FILE)
with bz2.open(a.OUTPUT_FILE, 'wb') as f:
    pickle.dump(pca_features, f)  # close explicitly so the bz2 stream is flushed
print("Done.")
Example n. 52
0
#!/usr/bin/env python3
# encoding: utf-8
#
# Copyright (c) 2015 Doug Hellmann All rights reserved.
#
"""Write and read unicode data to a file.
"""
#end_pymotw_header

import bz2
import os

data = 'Character with an åccent.'

with bz2.open('example.bz2', 'wt', encoding='utf-8') as output:
    output.write(data)

with bz2.open('example.bz2', 'rt', encoding='utf-8') as input:
    print('Full file: {}'.format(input.read()))

# Move to the beginning of the accented character.
with bz2.open('example.bz2', 'rt', encoding='utf-8') as input:
    input.seek(18)
    print('One character: {}'.format(input.read(1)))

# Move to the middle of the accented character.
with bz2.open('example.bz2', 'rt', encoding='utf-8') as input:
    input.seek(19)
    try:
        print(input.read(1))
    except UnicodeDecodeError:
        print('ERROR: failed to decode')
Example n. 53
0
def _open_bz2(filename, mode: str) -> IO:
    return bz2.open(filename, mode)
Example n. 54
0
    args = parser.parse_args()

    # Load MM corpus and dictionary
    corpus = load_mm_corpus(args.mm_fname)
    dictionary = gensim.corpora.Dictionary.load(args.dict_fname)
    with open(args.categories_fname, 'rb') as categories_file:
        categories = pickle.load(categories_file)

    prepared_query_funcs = {}
    for name, search_query in ALL_SEARCH_QUERIES.items():
        prepared_query_funcs[name] = search_query(corpus, dictionary, categories)

    query_funcs = [
        eval(query_func, prepared_query_funcs)
        for query_func in args.query_funcs
    ]

    logging.info('Exploring %s with functions %s', args.wiki_dump_fname,
                 ', '.join(map(str, args.query_funcs)))
    with bz2.open(args.wiki_dump_fname, 'rt') as wiki_dump_file:
        with open(args.results_fname, 'w') as results_file:
            results = csv.writer(results_file)
            # Write header row
            results.writerow(['Title'] + args.query_funcs)
            for title, content, pageid in \
                    wikicorpus.extract_pages(wiki_dump_file,
                                             filter_namespaces=('0',)):
                results.writerow(
                    [title] +
                    [query_func(title, content) for query_func in query_funcs])
Example n. 55
0
def file_type(f_name):
    if f_name[-4:] == '.bz2':
        reader = bz2.open(f_name, "rt")
        return reader
    return open(f_name, "r")
Example n. 56
0
#!/usr/bin/env python3

import os, sys, bz2, csv, re, json, sqlite3
import urllib.request

import pprint

myUA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0'
conn = sqlite3.connect('giga.authors.sqlite')
conn.execute(
    'CREATE TABLE PubDat (DOI TEXT, Title TEXT, Type TEXT, Authors TEXT, RefList TEXT)'
)

with bz2.open('giga.tsv.bz2', 'rt', encoding='utf-8') as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        if len(row) == 0 or row[0] == 'Title': continue
        print((len(row), row[0]))
        theurl = ('https://academic.oup.com/gigascience/article-lookup/doi/'
                  + row[16])
        req = urllib.request.Request(theurl)
        req.add_header('Referer', 'https://academic.oup.com/gigascience/')
        req.add_header('User-Agent', myUA)
        with urllib.request.urlopen(req) as r:
            htm = r.read().decode('utf-8')
        it = iter(htm.split('\n'))
        data = {'strAuthors': 'NA', 'reflist': 'NA', 'tocSections': 'NA'}
        for line, secline in zip(it, it):
            if re.search(r'<script type="application/ld\+json">', line):
                datAuthors = json.loads(secline)
                data['strAuthors'] = json.dumps(datAuthors['author'])
Example n. 57
0
    def open(cls,
             column_names: typing.List[str],
             file_path: typing.Optional[Path],
             who: str = "output",
             require_all_columns: bool = True,
             prohibit_extra_columns: bool = True,
             fill_missing_columns: bool = False,
             error_file: typing.TextIO = sys.stderr,
             header_error_action: ValidationAction = ValidationAction.EXIT,
             use_mgzip: bool = False,
             mgzip_threads: int = MGZIP_THREAD_COUNT_DEFAULT,
             gzip_in_parallel: bool = False,
             gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT,
             column_separator: str = KgtkFormat.COLUMN_SEPARATOR,
             mode: Mode = Mode.AUTO,
             output_format: typing.Optional[str] = None,
             output_column_names: typing.Optional[typing.List[str]] = None,
             old_column_names: typing.Optional[typing.List[str]] = None,
             new_column_names: typing.Optional[typing.List[str]] = None,
             verbose: bool = False,
             very_verbose: bool = False)->"KgtkWriter":

        if file_path is None or str(file_path) == "-":
            if verbose:
                print("KgtkWriter: writing stdout", file=error_file, flush=True)

            if output_format is None:
                output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(column_names=column_names,
                              file_path=None,
                              who=who,
                              file_out=sys.stdout,
                              require_all_columns=require_all_columns,
                              prohibit_extra_columns=prohibit_extra_columns,
                              fill_missing_columns=fill_missing_columns,
                              error_file=error_file,
                              header_error_action=header_error_action,
                              use_mgzip=use_mgzip,
                              mgzip_threads=mgzip_threads,
                              gzip_in_parallel=gzip_in_parallel,
                              gzip_queue_size=gzip_queue_size,
                              column_separator=column_separator,
                              mode=mode,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              verbose=verbose,
                              very_verbose=very_verbose,
            )
        
        if str(file_path).startswith(">"):
            fd: int = int(str(file_path)[1:])
            if verbose:
                print("%s: writing file descriptor %d" % (who, fd), file=error_file, flush=True)

            if output_format is None:
                output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(column_names=column_names,
                              file_path=file_path,
                              who=who,
                              file_out=open(fd, "w"),
                              require_all_columns=require_all_columns,
                              prohibit_extra_columns=prohibit_extra_columns,
                              fill_missing_columns=fill_missing_columns,
                              error_file=error_file,
                              header_error_action=header_error_action,
                              use_mgzip=use_mgzip,
                              mgzip_threads=mgzip_threads,
                              gzip_in_parallel=gzip_in_parallel,
                              gzip_queue_size=gzip_queue_size,
                              column_separator=column_separator,
                              mode=mode,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              verbose=verbose,
                              very_verbose=very_verbose,
            )
                

        if verbose:
            print("File_path.suffix: %s" % file_path.suffix, file=error_file, flush=True)

        if file_path.suffix in [".gz", ".bz2", ".xz", ".lz4"]:
            # TODO: find a better way to coerce typing.IO[Any] to typing.TextIO
            gzip_file: typing.TextIO
            if file_path.suffix == ".gz":
                if use_mgzip:
                    if verbose:
                        print("KgtkWriter: writing gzip with %d threads: %s" % (mgzip_threads, str(file_path)), file=error_file, flush=True)
                    import mgzip
                    gzip_file = mgzip.open(str(file_path), mode="wt", thread=mgzip_threads) # type: ignore
                else:
                    if verbose:
                        print("KgtkWriter: writing gzip %s" % str(file_path), file=error_file, flush=True)
                    import gzip
                    gzip_file = gzip.open(file_path, mode="wt") # type: ignore

            elif file_path.suffix == ".bz2":
                if verbose:
                    print("KgtkWriter: writing bz2 %s" % str(file_path), file=error_file, flush=True)
                import bz2
                gzip_file = bz2.open(file_path, mode="wt") # type: ignore

            elif file_path.suffix == ".xz":
                if verbose:
                    print("KgtkWriter: writing lzma %s" % str(file_path), file=error_file, flush=True)
                import lzma
                gzip_file = lzma.open(file_path, mode="wt") # type: ignore

            elif file_path.suffix == ".lz4":
                if verbose:
                    print("KgtkWriter: writing lz4 %s" % str(file_path), file=error_file, flush=True)
                import lz4.frame # type: ignore
                gzip_file = lz4.frame.open(str(file_path), mode="wt") # type: ignore
            else:
                # TODO: throw a better exception.
                raise ValueError("Unexpected file_path.suffix = '%s'" % file_path.suffix)

            if output_format is None:
                if len(file_path.suffixes) < 2:
                    output_format = cls.OUTPUT_FORMAT_DEFAULT
                else:
                    format_suffix: str = file_path.suffixes[-2]
                    if format_suffix == ".md":
                        output_format = cls.OUTPUT_FORMAT_MD
                    elif format_suffix == ".csv":
                        output_format = cls.OUTPUT_FORMAT_CSV
                    elif format_suffix == ".json":
                        output_format = cls.OUTPUT_FORMAT_JSON
                    elif format_suffix == ".jsonl":
                        output_format = cls.OUTPUT_FORMAT_JSONL
                    else:
                        output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(column_names=column_names,
                              file_path=file_path,
                              who=who,
                              file_out=gzip_file,
                              require_all_columns=require_all_columns,
                              prohibit_extra_columns=prohibit_extra_columns,
                              fill_missing_columns=fill_missing_columns,
                              error_file=error_file,
                              header_error_action=header_error_action,
                              use_mgzip=use_mgzip,
                              mgzip_threads=mgzip_threads,
                              gzip_in_parallel=gzip_in_parallel,
                              gzip_queue_size=gzip_queue_size,
                              column_separator=column_separator,
                              mode=mode,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              verbose=verbose,
                              very_verbose=very_verbose,
            )
            
        else:
            if output_format is None:
                if file_path.suffix == ".md":
                    output_format = cls.OUTPUT_FORMAT_MD
                elif file_path.suffix == ".csv":
                    output_format = cls.OUTPUT_FORMAT_CSV
                elif file_path.suffix == ".json":
                    output_format = cls.OUTPUT_FORMAT_JSON
                elif file_path.suffix == ".jsonl":
                    output_format = cls.OUTPUT_FORMAT_JSONL
                else:
                    output_format = cls.OUTPUT_FORMAT_DEFAULT

            if verbose:
                print("KgtkWriter: writing file %s" % str(file_path), file=error_file, flush=True)
            return cls._setup(column_names=column_names,
                              file_path=file_path,
                              who=who,
                              file_out=open(file_path, "w"),
                              require_all_columns=require_all_columns,
                              prohibit_extra_columns=prohibit_extra_columns,
                              fill_missing_columns=fill_missing_columns,
                              error_file=error_file,
                              header_error_action=header_error_action,
                              use_mgzip=use_mgzip,
                              mgzip_threads=mgzip_threads,
                              gzip_in_parallel=gzip_in_parallel,
                              gzip_queue_size=gzip_queue_size,
                              column_separator=column_separator,
                              mode=mode,
                              output_format=output_format,
                              output_column_names=output_column_names,
                              old_column_names=old_column_names,
                              new_column_names=new_column_names,
                              verbose=verbose,
                              very_verbose=very_verbose,
)
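
# A hedged usage sketch (column names and path are hypothetical): a ".tsv.gz"
# suffix routes the stream through gzip.open above; the writer's row-level
# API is not shown in this excerpt.
#
# from pathlib import Path
# kw = KgtkWriter.open(["node1", "label", "node2"], Path("edges.tsv.gz"))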
Example n. 58
0
def open_with_compression(filename, mode='r'):
    """
    Wrapper around builtin `open` that will guess compression of a file
    from the filename and open it for reading or writing as if it were
    a standard file.

    Implemented for ``gz`` (gzip), ``bz2`` (bzip2) and ``xz`` (lzma). Either
    Python 3 or the ``backports.lzma`` module are required for ``xz``.

    Supported modes are:
       * 'r', 'rt', 'w', 'wt' for text mode read and write.
       * 'rb, 'wb' for binary read and write.
    Depending on the Python version, you may get errors trying to write the
    wrong string type to the file.

    Parameters
    ==========
    filename: str
        Path to the file to open, including any extensions that indicate
        the compression used.
    mode: str
        Mode to open the file, same as for builtin ``open``, e.g. 'r', 'w'.

    Returns
    =======
    fd: file
        File-like object open with the specified mode.
    """

    if sys.version_info[0] > 2:
        # Compressed formats sometimes default to binary, so force
        # text mode in Python 3.
        if mode == 'r':
            mode = 'rt'
        elif mode == 'w':
            mode = 'wt'
        elif mode == 'a':
            mode = 'at'
    else:
        # The version of gzip in Anaconda Python 2 on Windows forcibly
        # adds a 'b', so strip any 't' and let the string conversions
        # be carried out implicitly by Python.
        mode = mode.strip('t')

    root, compression = get_compression(filename)

    if compression is None:
        return open(filename, mode)
    elif compression == 'gz':
        import gzip
        fd = gzip.open(filename, mode=mode)
    elif compression == 'bz2':
        import bz2
        if hasattr(bz2, 'open'):
            # Python 3 only
            fd = bz2.open(filename, mode=mode)
        else:
            # Python 2
            fd = bz2.BZ2File(filename, mode=mode)
    elif compression == 'xz':
        try:
            from lzma import open as lzma_open
        except ImportError:
            from backports.lzma import open as lzma_open
        fd = lzma_open(filename, mode)
    else:
        fd = open(filename, mode)

    return fd
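
# Usage sketch (the file name is hypothetical): compression is inferred from
# the extension, so callers never branch on the codec themselves.
with open_with_compression('positions.xyz.bz2') as fd:
    header = fd.readline()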
Example n. 59
0
    def save(self, pklfile):
        #print("Saving to {}".format(pklfile))
        with bz2.open(pklfile, "wb") as fout:
            pickle.dump(self, fout)
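
    # The matching load is symmetric (a sketch, not part of the original class):
    # with bz2.open(pklfile, "rb") as fin:
    #     obj = pickle.load(fin)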
Example n. 60
0
        if "Синоними" in text:
            result = syns_list_re.search(text)
            if result and result.group(0).strip():
                syns = cleanup_syns_list(result.group(0).strip())
    return (word, syns)

waiting_ns = False
waiting_text = False
text = ''
title = None
all_count = 0
bg_count = 0
syn_count = 0
lang_re = re.compile(r"\{\{-bg-\}\}|ЕЗИК\s*=\s*bg|ЕЗИК\s*=\s*български", re.M | re.UNICODE)
with open('bg_wiktionary_syns.txt', 'w') as output_file:
    for (event, elem) in etree.iterparse(bz2.open("wiktionary.xml.bz2"), events=['start', 'end']):
        if (event == 'start' and elem.tag[-4:] == 'page'):
            waiting_ns = True
            continue
        if (event == 'end' and elem.tag[-4:] == 'page'):
            waiting_ns = False
            waiting_text = False
            title = None
            continue
        if (event == 'end' and waiting_ns and elem.tag[-5:] == 'title'):
            title = elem.text
            continue
        if (waiting_ns and event == 'end' and elem.tag[-2:] == 'ns'):
            if elem.text.strip() == '0':
                waiting_text = True
            continue