Example #1
0
def stream_items(thrift_data):
    '''
    Generate the StreamItems serialized in a buffer of thrift data.

    thrift_data is wrapped in a StringIO, so the whole serialized
    content is expected to be in memory already.  Items are decoded one
    at a time with TBinaryProtocol and yielded in the order they are
    stored; the generator finishes when reading raises EOFError.
    '''
    ## file object -> buffered thrift transport -> binary protocol
    buf = StringIO(thrift_data)
    buf.seek(0)
    protocol = TBinaryProtocol.TBinaryProtocol(
        TTransport.TBufferedTransport(buf))

    ## keep decoding until the input buffer is exhausted
    while True:
        ## a fresh StreamItem is filled in on every pass
        item = StreamItem()
        try:
            ## deserialize the next item; this is the thrift analog of
            ## json.loads(line) on a JSON stream-item, see
            ## http://trec-kba.org/schemas/v1.0/stream-item.json
            item.read(protocol)
            yield item
        except EOFError:
            ## nothing left to read: end the iteration
            return
Example #2
0
def stream_items(thrift_data):
    '''
    Lazily decode every StreamItem held in a buffer of thrift data.

    The buffer is exposed to thrift through StringIO, a buffered
    transport and a binary protocol.  Each pass of the loop reads one
    StreamItem and yields it; an EOFError from the read marks the end of
    the buffer and stops the generator.
    '''
    ## wrap the buffer step by step: file obj, transport, protocol
    source = StringIO(thrift_data)
    source.seek(0)
    buffered = TTransport.TBufferedTransport(source)
    decoder = TBinaryProtocol.TBinaryProtocol(buffered)

    exhausted = False
    while not exhausted:
        ## StreamItem comes from the kba thrift definitions
        stream_item = StreamItem()
        try:
            ## thrift counterpart of json.loads(line); the JSON form of
            ## a stream-item is described at
            ## http://trec-kba.org/schemas/v1.0/stream-item.json
            stream_item.read(decoder)

            ## hand the decoded item to the consumer
            yield stream_item
        except EOFError:
            exhausted = True
Example #3
0
  def parse_thift_data(self, thrift_dir):
    '''
    Scan the thrift files of a directory and store every stream item
    whose stream_id is a key of self._missed_docs.

    thrift_dir -- directory whose entries are read; names ending in
                  .gpg or .xz are skipped

    For each (stream_id, urlname) pair found, a record holding the file
    name, query, rating, stream id and cleansed body is written to
    self._missed_docs_db with hmset, and its id is appended to the
    RedisDB.ret_item_list list.

    Raises AssertionError when a file is empty.
    '''
    for fname in os.listdir(thrift_dir):
      ## only the decrypted / decompressed files are parsed
      if fname.endswith(('.gpg', '.xz')): continue

      ## binary mode: the content is thrift-serialized bytes; the
      ## with-block closes the handle as soon as the data is read
      fpath = os.path.join(thrift_dir, fname)
      with open(fpath, 'rb') as thrift_file:
        thrift_data = thrift_file.read()

      ## raised explicitly so the check is not stripped by python -O
      if not thrift_data:
        raise AssertionError("failed to load: %s" % fpath)

      ## wrap it in a file obj, thrift transport, and thrift protocol
      transport = StringIO(thrift_data)
      transport.seek(0)
      transport = TTransport.TBufferedTransport(transport)
      protocol = TBinaryProtocol.TBinaryProtocol(transport)

      try:
        ## iterate over all thrift items
        while True:
          stream_item = StreamItem()
          try:
            stream_item.read(protocol)
          except EOFError:
            break

          stream_id = stream_item.stream_id
          if stream_id not in self._missed_docs:
            continue

          ratings = self._missed_docs[stream_id]
          for urlname in ratings:
            ## the current list length serves as the id of the new record
            item_id = self._missed_docs_db.llen(RedisDB.ret_item_list)
            ret_item = {
              'id': item_id,
              'file': fname,
              'query': urlname,
              'rating': ratings[urlname],
              'stream_id': stream_id,
              'stream_data': stream_item.body.cleansed,
            }

            self._missed_docs_db.hmset(item_id, ret_item)
            self._missed_docs_db.rpush(RedisDB.ret_item_list, item_id)
            print('Missed %s %s\n\n\n' % (urlname, stream_id))
      finally:
        ## close that transport, also when processing an item fails
        transport.close()

      # drop the reference before the next file is loaded
      thrift_data = None
Example #4
0
  def get(self, date, file, epoch, doc_id):
    '''
    Render one document of a corpus file, looked up by its stream id.

    date   -- name of the directory below corpus_dir that holds the file
    file   -- name of the thrift file inside that directory
    epoch  -- timestamp in epoch seconds; first part of the stream id
    doc_id -- second part of the stream id, also used as the page title

    The raw title, body and anchor of the first stream item whose
    stream_id equals '<epoch>-<doc_id>' are rendered with doc.html.
    When no item matches, the page shows the 'Null' placeholders.  A
    missing directory or an empty file renders error.html instead.
    '''
    ## NOTE(review): date and file are joined into the path unchecked --
    ## confirm the URL pattern keeps them inside corpus_dir
    date_dir = os.path.join(corpus_dir, date)
    target_id = '%s-%s' % (epoch, doc_id)

    if not os.path.isdir(date_dir):
      msg = 'directory %s can not be opened' % date_dir
      self.render("error.html", msg=msg)
      return

    ## placeholders stay in place when the document is not found
    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = file
    doc['time'] = datetime.datetime.utcfromtimestamp(float(epoch)).ctime()
    doc['id'] = target_id

    ## binary mode: the content is thrift-serialized bytes; the
    ## with-block closes the handle as soon as the data is read
    fpath = os.path.join(date_dir, file)
    with open(fpath, 'rb') as thrift_file:
      thrift_data = thrift_file.read()

    if not thrift_data:
      msg = 'failed to load: %s' % fpath
      self.render("error.html", msg=msg)
      return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    try:
      ## read items until the target shows up or the data runs out
      while True:
        stream_item = StreamItem()
        try:
          stream_item.read(protocol)
        except EOFError:
          break

        if stream_item.stream_id == target_id:
          doc['title'] = stream_item.title.raw
          doc['body'] = stream_item.body.raw
          doc['anchor'] = stream_item.anchor.raw
          break
    finally:
      ## release the transport and the buffer behind it
      transport.close()

    self.render("doc.html", title=doc_id, doc=doc)
Example #5
0
  def get(self, date, file, epoch, doc_id):
    '''
    Render the cleansed content of one document of a corpus file.

    date   -- name of the directory below corpus_dir that holds the file
    file   -- name of the thrift file inside that directory
    epoch  -- timestamp in epoch seconds; first part of the stream id
    doc_id -- second part of the stream id, also used as the page title

    The file is scanned for the first stream item whose stream_id equals
    '<epoch>-<doc_id>'; its cleansed title, body and anchor are rendered
    with doc.html.  When no item matches, the page shows the 'Null'
    placeholders.  A missing directory or an empty file renders
    error.html instead.
    '''
    ## NOTE(review): date and file are joined into the path unchecked --
    ## confirm the URL pattern keeps them inside corpus_dir
    date_dir = os.path.join(corpus_dir, date)
    target_id = '%s-%s' % (epoch, doc_id)

    if not os.path.isdir(date_dir):
      msg = 'directory %s can not be opened' % date_dir
      self.render("error.html", msg=msg)
      return

    ## placeholders stay in place when the document is not found
    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = file
    doc['time'] = datetime.datetime.utcfromtimestamp(float(epoch)).ctime()
    doc['id'] = target_id

    ## binary mode: the content is thrift-serialized bytes; the
    ## with-block closes the handle as soon as the data is read
    fpath = os.path.join(date_dir, file)
    with open(fpath, 'rb') as thrift_file:
      thrift_data = thrift_file.read()

    if not thrift_data:
      msg = 'failed to load: %s' % fpath
      self.render("error.html", msg=msg)
      return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    try:
      ## read items until the target shows up or the data runs out
      while True:
        stream_item = StreamItem()
        try:
          stream_item.read(protocol)
        except EOFError:
          break

        if stream_item.stream_id == target_id:
          doc['title'] = stream_item.title.cleansed
          doc['body'] = stream_item.body.cleansed
          doc['anchor'] = stream_item.anchor.cleansed
          break
    finally:
      ## release the transport and the buffer behind it
      transport.close()

    self.render("doc.html", title=doc_id, doc=doc)
Example #6
0
    def parse_thift_data(self, thrift_dir):
        """
        Pass every stream item stored under a directory to
        self.process_stream_item.

        thrift_dir -- directory whose entries are read; names ending in
                      .gpg or .xz are skipped

        process_stream_item receives the file name, the item's stream_id
        and its cleansed body.

        Raises AssertionError when a file is empty.
        """
        for fname in os.listdir(thrift_dir):
            ## only the decrypted / decompressed files are parsed
            if fname.endswith((".gpg", ".xz")):
                continue

            ## verbose output
            print("Process %s" % fname)

            ## binary mode: the content is thrift-serialized bytes; the
            ## with-block closes the handle as soon as the data is read
            fpath = os.path.join(thrift_dir, fname)
            with open(fpath, "rb") as thrift_file:
                thrift_data = thrift_file.read()

            ## raised explicitly so the check is not stripped by python -O
            if not thrift_data:
                raise AssertionError("failed to load: %s" % fpath)

            ## wrap it in a file obj, thrift transport, and thrift protocol
            transport = StringIO(thrift_data)
            transport.seek(0)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)

            try:
                ## iterate over all thrift items
                while True:
                    stream_item = StreamItem()
                    try:
                        stream_item.read(protocol)
                    except EOFError:
                        break

                    self.process_stream_item(
                        fname, stream_item.stream_id, stream_item.body.cleansed)
            finally:
                ## close that transport, also when processing an item fails
                transport.close()

            # drop the reference before the next file is loaded
            thrift_data = None
Example #7
0
    def parse_thift_data(self, thrift_dir):
        '''
        Run self.process_stream_item over all stream items of a directory.

        thrift_dir -- directory whose entries are read; names ending in
                      .gpg or .xz are skipped

        Each remaining file is loaded completely, decoded item by item,
        and every item is reported as (file name, stream_id, cleansed
        body).

        Raises AssertionError when a file is empty.
        '''
        skipped_suffixes = ('.gpg', '.xz')

        for fname in os.listdir(thrift_dir):
            ## only the decrypted / decompressed files are parsed
            if fname.endswith(skipped_suffixes): continue

            ## verbose output
            print('Process %s' % fname)

            ## binary mode: the content is thrift-serialized bytes; the
            ## with-block closes the handle as soon as the data is read
            fpath = os.path.join(thrift_dir, fname)
            with open(fpath, 'rb') as thrift_file:
                thrift_data = thrift_file.read()

            ## raised explicitly so the check is not stripped by python -O
            if not thrift_data:
                raise AssertionError("failed to load: %s" % fpath)

            ## wrap it in a file obj, thrift transport, and thrift protocol
            transport = StringIO(thrift_data)
            transport.seek(0)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)

            try:
                ## iterate over all thrift items
                while True:
                    stream_item = StreamItem()
                    try:
                        stream_item.read(protocol)
                    except EOFError:
                        break

                    self.process_stream_item(fname, stream_item.stream_id,
                                             stream_item.body.cleansed)
            finally:
                ## close that transport, also when processing an item fails
                transport.close()

            # drop the reference before the next file is loaded
            thrift_data = None
Example #8
0
    def ProcessThriftFile(self, fpath):
        '''
        Decode every stream item of one thrift file and save it as a Doc.

        fpath -- path of the thrift file to load

        For each item a Doc is filled with the stream id, the epoch
        ticks, the UTC time string and the cleansed title, body and
        anchor, then handed to self.SaveDoc.  An empty file is reported
        on stdout and skipped.
        '''
        ## binary mode: the content is thrift-serialized bytes; the
        ## with-block closes the handle as soon as the data is read
        with open(fpath, 'rb') as thrift_file:
            thrift_data = thrift_file.read()

        if not thrift_data:
            msg = 'failed to load: %s' % fpath
            print('Error: %s' % msg)
            return

        ## wrap it in a file obj, thrift transport, and thrift protocol
        transport = StringIO(thrift_data)
        transport.seek(0)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        try:
            ## iterate over all thrift items
            while True:
                stream_item = StreamItem()
                ## only the read is guarded: an EOFError raised while
                ## saving must not be mistaken for the end of the data
                try:
                    stream_item.read(protocol)
                except EOFError:
                    break

                doc = Doc()
                doc.id = stream_item.stream_id
                doc.epoch = stream_item.stream_time.epoch_ticks
                doc.time = datetime.datetime.utcfromtimestamp(
                    doc.epoch).ctime()
                doc.title = stream_item.title.cleansed
                doc.body = stream_item.body.cleansed
                doc.anchor = stream_item.anchor.cleansed

                self.SaveDoc(doc)
        finally:
            ## close that transport, also when saving a doc fails
            transport.close()
Example #9
0
  def get(self, date, file):
    '''
    Render the index of all documents stored in one corpus file.

    date -- name of the directory below corpus_dir that holds the file
    file -- name of the thrift file inside that directory

    One Doc per stream item is collected, carrying the stream id, the
    epoch ticks and the matching UTC time string, and the list is
    rendered with file-index.html.  An empty file renders error.html.
    '''
    ## NOTE(review): date and file are joined into the path unchecked --
    ## confirm the URL pattern keeps them inside corpus_dir
    fpath = os.path.join(corpus_dir, date, file)

    ## binary mode: the content is thrift-serialized bytes; the
    ## with-block closes the handle as soon as the data is read
    with open(fpath, 'rb') as thrift_file:
      thrift_data = thrift_file.read()

    if not thrift_data:
      msg = 'failed to load: %s' % fpath
      self.render("error.html", msg=msg)
      return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    docs = []
    try:
      ## iterate over all thrift items
      while True:
        stream_item = StreamItem()
        try:
          stream_item.read(protocol)
        except EOFError:
          break

        doc = Doc()
        doc.id = stream_item.stream_id
        doc.epoch = stream_item.stream_time.epoch_ticks
        doc.time = datetime.datetime.utcfromtimestamp(doc.epoch).ctime()
        docs.append(doc)
    finally:
      ## close that transport, also when building a doc fails
      transport.close()

    # free memory before the page is rendered
    thrift_data = None

    self.render("file-index.html", title=file, date=date, file=file, docs=docs)
Example #10
0
  def get(self, date, file):
    '''
    List the stream items of one corpus file on the file-index page.

    date -- name of the directory below corpus_dir that holds the file
    file -- name of the thrift file inside that directory

    The file is decoded item by item; for every item a Doc with id,
    epoch and a UTC time string is appended to the list passed to
    file-index.html.  An empty file renders error.html instead.
    '''
    ## NOTE(review): date and file are joined into the path unchecked --
    ## confirm the URL pattern keeps them inside corpus_dir
    fpath = os.path.join(corpus_dir, date, file)

    ## binary mode: the content is thrift-serialized bytes; the
    ## with-block closes the handle as soon as the data is read
    with open(fpath, 'rb') as thrift_file:
      thrift_data = thrift_file.read()

    if not thrift_data:
      msg = 'failed to load: %s' % fpath
      self.render("error.html", msg=msg)
      return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    docs = []
    try:
      ## iterate over all thrift items
      while True:
        stream_item = StreamItem()
        try:
          stream_item.read(protocol)
        except EOFError:
          break

        doc = Doc()
        doc.id = stream_item.stream_id
        doc.epoch = stream_item.stream_time.epoch_ticks
        doc.time = datetime.datetime.utcfromtimestamp(doc.epoch).ctime()
        docs.append(doc)
    finally:
      ## close that transport, also when building a doc fails
      transport.close()

    # free memory before the page is rendered
    thrift_data = None

    self.render("file-index.html", title=file, date=date, file=file, docs=docs)
Example #11
0
  def ProcessThriftFile(self, fpath):
    '''
    Save a Doc for every stream item contained in one thrift file.

    fpath -- path of the thrift file to load

    Each Doc carries the stream id, the epoch ticks, the UTC time string
    and the cleansed title, body and anchor of its item, and is passed
    to self.SaveDoc.  An empty file is reported on stdout and skipped.
    '''
    ## binary mode: the content is thrift-serialized bytes; the
    ## with-block closes the handle as soon as the data is read
    with open(fpath, 'rb') as thrift_file:
      thrift_data = thrift_file.read()

    if not thrift_data:
      msg = 'failed to load: %s' % fpath
      print('Error: %s' % msg)
      return

    ## wrap it in a file obj, thrift transport, and thrift protocol
    transport = StringIO(thrift_data)
    transport.seek(0)
    transport = TTransport.TBufferedTransport(transport)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    try:
      ## iterate over all thrift items
      while True:
        stream_item = StreamItem()
        ## only the read is guarded: an EOFError raised while saving
        ## must not be mistaken for the end of the data
        try:
          stream_item.read(protocol)
        except EOFError:
          break

        doc = Doc()
        doc.id = stream_item.stream_id
        doc.epoch = stream_item.stream_time.epoch_ticks
        doc.time = datetime.datetime.utcfromtimestamp(doc.epoch).ctime()
        doc.title = stream_item.title.cleansed
        doc.body = stream_item.body.cleansed
        doc.anchor = stream_item.anchor.cleansed

        self.SaveDoc(doc)
    finally:
      ## close that transport, also when saving a doc fails
      transport.close()
Example #12
0
  def get(self, epoch, id):
    '''
    Find a document by stream id and render it.

    epoch -- timestamp in epoch seconds; first part of the stream id
    id    -- second part of the stream id

    The directory name is derived from the UTC hour of epoch
    ('YYYY-MM-DD-HH' below corpus_dir).  Its files, except those ending
    in .gpg or .xz, are scanned until a stream item with the id
    '<epoch>-<id>' is found; its cleansed title, body and anchor plus
    the file name are rendered with doc.html.  When nothing matches, the
    page shows the 'Null' placeholders.  A missing directory answers
    with status 404 and error.html; an empty file renders error.html.
    '''
    moment = datetime.datetime.utcfromtimestamp(float(epoch))
    date = '%d-%.2d-%.2d-%.2d' % (moment.year, moment.month, moment.day,
                                  moment.hour)
    date_dir = os.path.join(corpus_dir, date)

    target_id = '%s-%s' % (epoch, id)

    if not os.path.isdir(date_dir):
      msg = 'directory %s can not be opened' % date_dir
      self.set_status(404)
      self.render("error.html", msg=msg)
      return

    ## placeholders stay in place when the document is not found
    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = 'Null'
    doc['time'] = moment.ctime()
    doc['id'] = target_id

    for fname in os.listdir(date_dir):
      ## only the decrypted / decompressed files are parsed
      if fname.endswith(('.gpg', '.xz')): continue

      ## binary mode: the content is thrift-serialized bytes; the
      ## with-block closes the handle as soon as the data is read
      fpath = os.path.join(date_dir, fname)
      with open(fpath, 'rb') as thrift_file:
        thrift_data = thrift_file.read()

      if not thrift_data:
        msg = 'failed to load: %s' % fpath
        self.render("error.html", msg=msg)
        return

      ## wrap it in a file obj, thrift transport, and thrift protocol
      transport = StringIO(thrift_data)
      transport.seek(0)
      transport = TTransport.TBufferedTransport(transport)
      protocol = TBinaryProtocol.TBinaryProtocol(transport)

      found = False
      try:
        ## read items until the target shows up or the data runs out
        while True:
          stream_item = StreamItem()
          try:
            stream_item.read(protocol)
          except EOFError:
            break

          if stream_item.stream_id == target_id:
            found = True
            doc['title'] = stream_item.title.cleansed
            doc['body'] = stream_item.body.cleansed
            doc['anchor'] = stream_item.anchor.cleansed
            doc['file'] = fname
            break
      finally:
        ## release the transport before moving on to the next file
        transport.close()

      ## the remaining files need not be scanned
      if found: break

    self.render("doc.html", title=target_id, doc=doc)
Example #13
0
  def parse_thift_data(self, corpus_dir):
    '''
    Locate the corpus file of every pending item and process it.

    corpus_dir -- directory holding one 'YYYY-MM-DD-HH' sub-directory
                  per UTC hour

    Items of self._item_list are visited in ascending 'id' order; those
    already present in self._eval_db are skipped.  The text before the
    first '-' of an item's stream_id is read as epoch seconds and
    selects the sub-directory.  Its files, except those ending in .gpg
    or .xz, are scanned until the stream item with that stream_id is
    found, which is then passed to self.process_stream_item together
    with the file name and the item.  Progress and failures are printed
    to stdout.
    '''
    for item in sorted(self._item_list, key=lambda item: item['id']):
      ## skip the existing items
      if self._eval_db.hexists(item['id'], 'id'):
        print('Skipping %d' % item['id'])
        continue

      target_id = item['stream_id']
      epoch = target_id.split('-')[0]

      moment = datetime.datetime.utcfromtimestamp(float(epoch))
      date = '%d-%.2d-%.2d-%.2d' % (moment.year, moment.month, moment.day,
                                    moment.hour)
      date_dir = os.path.join(corpus_dir, date)

      if not os.path.isdir(date_dir):
        print('directory %s can no be opened' % date_dir)
        continue

      found = False
      for fname in os.listdir(date_dir):
        ## only the decrypted / decompressed files are parsed
        if fname.endswith(('.gpg', '.xz')): continue

        ## binary mode: the content is thrift-serialized bytes; the
        ## with-block closes the handle as soon as the data is read
        fpath = os.path.join(date_dir, fname)
        with open(fpath, 'rb') as thrift_file:
          thrift_data = thrift_file.read()

        if not thrift_data:
          print("failed to load: %s" % fpath)
          continue

        ## wrap it in a file obj, thrift transport, and thrift protocol
        transport = StringIO(thrift_data)
        transport.seek(0)
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)

        try:
          ## read items until the target shows up or the data runs out
          while True:
            stream_item = StreamItem()
            try:
              stream_item.read(protocol)
            except EOFError:
              break

            if stream_item.stream_id == target_id:
              self.process_stream_item(fname, item, stream_item.body.cleansed)
              found = True
              break
        finally:
          ## close that transport, also when processing the item fails
          transport.close()

        # drop the reference before the next file is loaded
        thrift_data = None

        if found:
          print('Item %d processed' % item['id'])
          break

      if not found:
        print('Item %d (%s) can not be found in any file' % (item['id'],
            item['stream_id']))
Example #14
0
  def get(self, epoch, id):
    '''
    Find a document by stream id in the training or testing corpus and
    render it.

    epoch -- timestamp in epoch seconds; first part of the stream id
    id    -- second part of the stream id

    Timestamps of the UTC year 2011 are looked up below
    ./corpus/training, all others below ./corpus/testing, in the
    sub-directory named after the UTC hour ('YYYY-MM-DD-HH').  Its
    files, except those ending in .gpg or .xz, are scanned until a
    stream item with the id '<epoch>-<id>' is found; its cleansed title,
    body and anchor plus the file name are rendered with doc.html.  When
    nothing matches, the page shows the 'Null' placeholders.  A missing
    directory answers with status 404 and error.html; an empty file
    renders error.html.
    '''
    moment = datetime.datetime.utcfromtimestamp(float(epoch))
    date = '%d-%.2d-%.2d-%.2d' % (moment.year, moment.month, moment.day,
                                  moment.hour)

    if 2011 == moment.year:
      corpus_dir = './corpus/training'
    else:
      corpus_dir = './corpus/testing'
    date_dir = os.path.join(corpus_dir, date)

    target_id = '%s-%s' % (epoch, id)

    if not os.path.isdir(date_dir):
      msg = 'directory %s can not be opened' % date_dir
      self.set_status(404)
      self.render("error.html", msg=msg)
      return

    ## placeholders stay in place when the document is not found
    doc = Doc()
    doc['title'] = 'Null'
    doc['body'] = 'Null'
    doc['anchor'] = 'Null'
    doc['date'] = date
    doc['file'] = 'Null'
    doc['time'] = moment.ctime()
    doc['id'] = target_id

    for fname in os.listdir(date_dir):
      ## only the decrypted / decompressed files are parsed
      if fname.endswith(('.gpg', '.xz')): continue

      ## binary mode: the content is thrift-serialized bytes; the
      ## with-block closes the handle as soon as the data is read
      fpath = os.path.join(date_dir, fname)
      with open(fpath, 'rb') as thrift_file:
        thrift_data = thrift_file.read()

      if not thrift_data:
        msg = 'failed to load: %s' % fpath
        self.render("error.html", msg=msg)
        return

      ## wrap it in a file obj, thrift transport, and thrift protocol
      transport = StringIO(thrift_data)
      transport.seek(0)
      transport = TTransport.TBufferedTransport(transport)
      protocol = TBinaryProtocol.TBinaryProtocol(transport)

      found = False
      try:
        ## read items until the target shows up or the data runs out
        while True:
          stream_item = StreamItem()
          try:
            stream_item.read(protocol)
          except EOFError:
            break

          if stream_item.stream_id == target_id:
            found = True
            doc['title'] = stream_item.title.cleansed
            doc['body'] = stream_item.body.cleansed
            doc['anchor'] = stream_item.anchor.cleansed
            doc['file'] = fname
            break
      finally:
        ## release the transport before moving on to the next file
        transport.close()

      ## the remaining files need not be scanned
      if found: break

    self.render("doc.html", title=target_id, doc=doc)
Example #15
0
    def parse_thift_data(self, corpus_dir):
        '''
        Find the corpus file containing each pending item and process it.

        corpus_dir -- directory holding one 'YYYY-MM-DD-HH' sub-directory
                      per UTC hour

        Items of self._item_list are visited in ascending 'id' order;
        those already present in self._eval_db are skipped.  The text
        before the first '-' of an item's stream_id is read as epoch
        seconds and selects the sub-directory.  Its files, except those
        ending in .gpg or .xz, are scanned until the stream item with
        that stream_id is found, which is then passed to
        self.process_stream_item together with the file name and the
        item.  Progress and failures are printed to stdout.
        '''
        skipped_suffixes = ('.gpg', '.xz')

        for item in sorted(self._item_list, key=lambda item: item['id']):
            ## skip the existing items
            if self._eval_db.hexists(item['id'], 'id'):
                print('Skipping %d' % item['id'])
                continue

            target_id = item['stream_id']
            epoch = target_id.split('-')[0]

            moment = datetime.datetime.utcfromtimestamp(float(epoch))
            date = '%d-%.2d-%.2d-%.2d' % (moment.year, moment.month,
                                          moment.day, moment.hour)
            date_dir = os.path.join(corpus_dir, date)

            if not os.path.isdir(date_dir):
                print('directory %s can no be opened' % date_dir)
                continue

            found = False
            for fname in os.listdir(date_dir):
                ## only the decrypted / decompressed files are parsed
                if fname.endswith(skipped_suffixes): continue

                ## binary mode: the content is thrift-serialized bytes; the
                ## with-block closes the handle as soon as the data is read
                fpath = os.path.join(date_dir, fname)
                with open(fpath, 'rb') as thrift_file:
                    thrift_data = thrift_file.read()

                if not thrift_data:
                    print("failed to load: %s" % fpath)
                    continue

                ## wrap it in a file obj, thrift transport, and thrift protocol
                transport = StringIO(thrift_data)
                transport.seek(0)
                transport = TTransport.TBufferedTransport(transport)
                protocol = TBinaryProtocol.TBinaryProtocol(transport)

                try:
                    ## read items until the target shows up or data runs out
                    while True:
                        stream_item = StreamItem()
                        try:
                            stream_item.read(protocol)
                        except EOFError:
                            break

                        if stream_item.stream_id == target_id:
                            self.process_stream_item(fname, item,
                                                     stream_item.body.cleansed)
                            found = True
                            break
                finally:
                    ## close that transport, also when processing fails
                    transport.close()

                # drop the reference before the next file is loaded
                thrift_data = None

                if found:
                    print('Item %d processed' % item['id'])
                    break

            if not found:
                print('Item %d (%s) can not be found in any file' % (
                    item['id'], item['stream_id']))