Example #1
def cat(self, path, opts):
    addedopts = getopts(opts, ['libjar'], delete=False)
    streamingjar = findjar(self.hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                    shortcuts=dict(configopts('jars')))
    try:
        import typedbytes
        ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
        if sum(c in path for c in ("*", "?", "{")) > 0:
            # cat each file separately when the path contains special chars
            lineparts = (line.split()[-1] for line in ls)
            subpaths = [part for part in lineparts if part.startswith("/")]
        else:
            # we still do the ls even in this case to make sure we print errors
            subpaths = [path]
        ls.close()
        for subpath in subpaths:
            if subpath.endswith("/_logs"):
                continue
            dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                              % (hadenv, self.hadoop, streamingjar, subpath))
            ascodeopt = getopt(opts, 'ascode')
            if ascodeopt and ascodeopt[0] == 'yes':
                outputs = dumpcode(typedbytes.PairedInput(dumptb))
            else:
                outputs = dumptext(typedbytes.PairedInput(dumptb))
            for output in outputs:
                print '\t'.join(output)
            dumptb.close()
    except IOError:
        pass  # ignore
    return 0
Example #2
    def cat(self, path, opts):
        streamingjar = findjar(
            self.hadoop, 'streaming',
            opts['hadooplib'] if 'hadooplib' in opts else None)
        if not streamingjar:
            print >> sys.stderr, 'ERROR: Streaming jar not found'
            return 1
        hadenv = envdef('HADOOP_CLASSPATH',
                        opts['libjar'],
                        shortcuts=dict(configopts('jars')))
        try:
            import typedbytes
            ls = os.popen('%s %s -ls %s' % (hadenv, self.hdfs, path))
            subpaths = [line.split()[-1] for line in ls if ":" in line]
            ls.close()
            for subpath in subpaths:
                if subpath.split("/")[-1].startswith("_"):
                    continue
                dumptb = os.popen(
                    '%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null' %
                    (hadenv, self.hadoop, streamingjar, subpath))

                dump = dumpcode if 'yes' in opts['ascode'] else dumptext
                outputs = dump(typedbytes.PairedInput(dumptb))

                for output in outputs:
                    print '\t'.join(output)
                dumptb.close()
        except IOError:
            pass  # ignore
        return 0
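Both `cat` variants above hand the raw byte stream produced by the streaming jar's `dumptb` command to `typedbytes.PairedInput` and then print each decoded pair. A minimal sketch of that last step on a local file, without the dumbo helpers `dumpcode`/`dumptext`, might look like the following; the function name `print_tb_pairs` and the file argument are illustrative, not part of any API shown above:

import typedbytes

def print_tb_pairs(filename):
    # Decode a file of typedbytes key/value pairs and print them
    # tab-separated, one pair per line (roughly what dumptext does).
    f = open(filename, 'rb')
    try:
        for key, value in typedbytes.PairedInput(f):
            print '%s\t%s' % (key, value)
    finally:
        f.close()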
Example #3
def testpairio(self):
    objects = TestIO.objects
    file = open("test.bin", "wb")
    output = typedbytes.PairedOutput(file)
    output.writes(enumerate(objects))
    file.close()
    file = open("test.bin", "rb")
    input = typedbytes.PairedInput(file)
    for index, record in input:
        self.assertEqual(objects[index], record)
    file.close()
    os.remove("test.bin")
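The test above round-trips key/value pairs through `PairedOutput` and `PairedInput`. As a companion sketch, here is the same round trip with the flat stream classes; this assumes the typedbytes package also exposes `Output`/`Input` with the same `writes`/iteration interface, serializing single values rather than pairs:

import os
import typedbytes

# Assumption: Output/Input mirror PairedOutput/PairedInput for flat values.
objects = [1, 3.14, "test", {"key": "value"}]
f = open("flat.bin", "wb")
typedbytes.Output(f).writes(objects)
f.close()
f = open("flat.bin", "rb")
assert list(typedbytes.Input(f)) == objects
f.close()
os.remove("flat.bin")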
Example #4
# Identity filter: decode typedbytes pairs from stdin and re-encode
# them to stdout unchanged.
import typedbytes
import sys

b = typedbytes.PairedInput(sys.stdin)
c = typedbytes.PairedOutput(sys.stdout)
c.writes(b)
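A self-contained way to exercise this pass-through without a Hadoop pipeline is to swap the standard streams for in-memory buffers; the sample pairs below are made up for illustration:

import StringIO
import typedbytes

src = StringIO.StringIO()
typedbytes.PairedOutput(src).writes([(0, "first"), (1, "second")])
src.seek(0)

# Same pattern as above: writes() consumes the pairs decoded by PairedInput.
dst = StringIO.StringIO()
typedbytes.PairedOutput(dst).writes(typedbytes.PairedInput(src))
dst.seek(0)
print list(typedbytes.PairedInput(dst))  # -> [(0, 'first'), (1, 'second')]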
Example #5
import StringIO

import typedbytes

def parse_tb(val):
    # Decode a string of typedbytes data into (key, value) pairs.
    fp = StringIO.StringIO(val)
    for x in typedbytes.PairedInput(fp):
        yield x
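Hypothetical usage of `parse_tb`: serialize a few pairs with `PairedOutput` into an in-memory buffer, then parse the resulting string back. The sample pairs are made up:

buf = StringIO.StringIO()
typedbytes.PairedOutput(buf).writes([("a", 1), ("b", 2)])
for key, value in parse_tb(buf.getvalue()):
    print key, value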
Example #6
# Loads a corpus of sentences in typedbytes format from Hadoop into a database.

import os

os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'

import typedbytes
from ratings.models import *

file = open("/Users/pealco/archive/experiments/disagreement/data.tb", 'rb')
input = typedbytes.PairedInput(file)

for sha1, sentence in input:

    s = Sentence(sha1=sha1,
                 sentence=sentence.sentence,
                 grammatical=sentence.grammatical,
                 similarity=sentence.wup_similarity)
    s.save()

    dg = DirectedGraph()
    dg.save()

    nodelist = [Node() for node in sentence.dg.nodelist]
    for node in nodelist:
        node.save()

    subject_address = sentence.subject['address']
    intervenor_address = sentence.intervenor['address']
    verb_address = sentence.verb['address']

    for node in sentence.dg.nodelist: