def processQueries(self):
    self.rel_path = os.path.dirname(os.path.abspath(__file__))
    filename = self.rel_path + self.query_file_name
    # Parse the raw query file: each <doc> element holds a <docno> id followed by the query text
    self.rawQueries = collections.OrderedDict()
    with open(filename, 'r', encoding="utf-8") as file:
        data = file.read()
    soup = BeautifulSoup(data, "html.parser")
    content = soup.findAll("doc")
    for entry in content:
        query_id_tag = entry.find("docno")
        queryid = query_id_tag.getText().strip()
        query_id_tag.extract()
        query = entry.getText().strip()
        self.rawQueries[queryid] = query
    # Optionally remove stop words, then normalize punctuation for every query
    self.queries = collections.OrderedDict()
    corpusbuilder = CorpusBuilder()
    for query_id in self.rawQueries.keys():
        query_text = self.rawQueries[query_id]
        if self.stopping_required:
            query_text_list = query_text.split()
            s_query = self.remove_stopWords(self.stopwords_list, query_text_list)
            query_text = ''.join(s_query[0:len(s_query) - 1])
        content = ""
        # use the (possibly stop-word-filtered) text rather than the raw query
        content = corpusbuilder.punctuation_handler(content, query_text)
        self.queries[query_id] = content
def __init__(self, input_path, output_folder, is_verbose, foreground, self_bg, mel_bg, harm_bg,
             uses_mel_from_json, uses_held_melodics, uses_legacy_parser):
    self.logger = Main.init_logger(is_verbose)
    if not os.path.isabs(input_path):
        input_path = os.path.normpath(os.getcwd() + '/' + input_path)
    self.logger.debug(
        'Script was initialized with the following parameters:\n'
        + settings.DEBUG_INDENT + 'Input file/folder: {0}\n'.format(input_path)
        + settings.DEBUG_INDENT + 'Output folder: {0}\n'.format(output_folder)
        + settings.DEBUG_INDENT + 'Foreground channel(s): {0}\n'.format(foreground)
        + settings.DEBUG_INDENT + 'Self Background channel(s): {0}\n'.format(self_bg)
        + settings.DEBUG_INDENT + 'Melodic Background channel(s): {0}\n'.format(mel_bg)
        + settings.DEBUG_INDENT + 'Harmonic Background channel(s): {0}\n'.format(harm_bg))

    # NOTE! This primitive check will print a warning if the user is building the json file to a
    # folder that is not named `corpus`. If the naming of the corpus folder changes in future
    # versions, this check will be incorrect and must be updated.
    if os.path.normpath(os.path.basename(output_folder)) != settings.CORPUS_FOLDER_NAME:
        self.logger.warn(
            'Output folder is not set to default and will likely not be available inside SoMax, '
            'is this intentional?\n'
            'To ensure correct behaviour, please either run the script directly inside\n'
            'the corpus folder of SoMax or use the -o option to point to this directory.')

    builder = CorpusBuilder(input_path, foreground_channels=foreground, self_bg_channels=self_bg,
                            mel_bg_channels=mel_bg, harm_bg_channels=harm_bg,
                            uses_legacy_parser=uses_legacy_parser)

    # Build the corpus and write all the files (standard, harmonic and melodic)
    output_filepaths = builder.build_corpus(os.path.normpath(output_folder) + '/')
    log_string = "The following files were written:"
    for fp in output_filepaths:
        log_string += '\n' + settings.INFO_INDENT + fp
    self.logger.info(log_string)

    # Overwrite the generated melodic json file if the flag is set
    if uses_mel_from_json:
        self.generate_mel_from_json(output_filepaths, uses_held_melodics)
def runCorpusBuilder(self, is_stemmed_corpus):
    corpusBuilder = CorpusBuilder()
    if is_stemmed_corpus:
        corpusBuilder.create_stemmed_corpus_files()
    else:
        corpusBuilder.initialize_doc_keys()
        corpusBuilder.create_corpus()
def processStemmedQueries(self):
    self.rel_path = os.path.dirname(os.path.abspath(__file__))
    filename = self.rel_path + self.stemmed_query_file
    # The stemmed query file holds one query per line; the 1-based line number becomes the query id
    self.rawQueries = collections.OrderedDict()
    with open(filename, 'r', encoding="utf-8") as file:
        lines = file.read().splitlines()
    for query_id, query in enumerate(lines):
        self.rawQueries[str(query_id + 1)] = query
    self.queries = collections.OrderedDict()
    corpusbuilder = CorpusBuilder()
    for query_id in self.rawQueries.keys():
        content = ""
        content = corpusbuilder.punctuation_handler(content, self.rawQueries[query_id])
        self.queries[query_id] = content
def buildCorpus(outFN, block_size, stream_size, text_ids):
    if len(text_ids) == 0:
        print >>sys.stderr, "Must provide at least one input ID"
        sys.exit(0)
    builder = CorpusBuilder(outFN)
    total_bytes = 0
    stream_id = 0
    stream_bytes = 0
    for text_id in text_ids:
        text_id = int(text_id)
        # Download the e-text from Project Gutenberg and strip its boilerplate header/footer
        text = gutenberg.acquire.load_etext(text_id)
        text = gutenberg.cleanup.strip_headers(text).strip()
        addBlocks(builder, block_size, stream_size, text_id, text)
        total_bytes += len(text)
    builder.finish()
    print "Total:", total_bytes, "bytes."
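# Illustrative usage sketch (not part of the original script): the output filename,
# block/stream sizes and Project Gutenberg e-text IDs below are placeholder values.
# Assumes it runs in the same module, i.e. that the `gutenberg` package is importable
# and that addBlocks() is defined as above.
if __name__ == "__main__":
    buildCorpus("gutenberg_corpus.db", 1024, 64 * 1024, ["11", "1342"])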
def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db
    with name @outFN.
    '''
    if not os.path.exists(inFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % inFN
        sys.exit(-1)
    lines = open(inFN).readlines()
    if len(lines) == 0:
        print >> sys.stderr, "Input file contained no lines. Exiting."
        sys.exit(0)
    builder = CorpusBuilder(outFN)
    # write a single stream to contain everything
    streamId = 0
    for l in lines:
        builder.add_chunk(streamId, l.rstrip())
    builder.finish()
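# Illustrative usage sketch (not from the original script): the file names are placeholders.
# Writes each line of a plain-text file as a chunk of a single stream in a new db.
if __name__ == "__main__":
    lineCorpus("sentences.txt", "line_corpus.db")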
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

# Build a corpus "keith_rh" from the MIDI file with channel 1 in the foreground
corpus_path = "./examples/keith.mid"
builder = CorpusBuilder(corpus_path, corpus_name="keith_rh")
print builder.ops

standardOp = builder.ops[''][0]
standardOp.fgChannels = [1]
standardOp.bgChannels = [2]

# Add a melodic operation tracking channel 1
MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])
builder.build_corpus("./examples/output/")

# Build a second corpus "keith_lh" from the same file with the channel roles swapped
corpus_path = "./examples/keith.mid"
builder = CorpusBuilder(corpus_path, corpus_name="keith_lh")
standardOp = builder.ops[''][0]
standardOp.fgChannels = [2]
standardOp.bgChannels = [1]

MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])
builder.build_corpus("./examples/output/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic, OpSomaxHarmonic

corpus_path = "./examples/debussy.mid"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
standardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
standardOp.fgChannels = [2, 3, 4]
standardOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
# MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
# MelOp.fgChannels = [1]
# builder.ops['m'] = (MelOp, builder.ops[''][1])

# print "1: ", standardOp.file_paths
# print "2: ", standardOp.corpus_name
# print "3: ", builder.ops[''][1]

# HarmOp = OpSomaxHarmonic(standardOp.file_paths, standardOp.corpus_name)
# HarmOp.fgChannels = [1]
# builder.ops['h'] = (HarmOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./examples/output/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

corpus_path = "./debussy"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
StandardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
# StandardOp.setParameter('fgChannels', '2 3 4')
# StandardOp.setParameter('bgChannels', '1 2 3 4')
# or directly into the attributes of the Op object
StandardOp.fgChannels = [2, 3, 4]
StandardOp.bgChannels = [1, 2, 3, 4]

HarmOp = builder.ops['h'][0]
HarmOp.fgChannels = [1, 2, 3, 4]
HarmOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
MelOp = OpSomaxMelodic(StandardOp.file_paths, StandardOp.corpus_name)
MelOp.fgChannels = [1]
MelOp.bgChannels = [1, 2, 3, 4]
builder.ops['m'] = (MelOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./../corpus/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

corpus_path = "./examples/debussy.mid"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
standardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
standardOp.fgChannels = [2, 3, 4]
standardOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./examples/output/")
def enchunk_pcap(pcapFN, sqliteFN):
    """Read the contents of a pcap file with name @pcapFN and produce a
    sqlite db with name @sqliteFN. It will contain chunks of data from
    TCP and UDP streams."""
    if not os.path.exists(pcapFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % pcapFN
        sys.exit(-1)

    builder = CorpusBuilder(sqliteFN)

    #
    # Read in the contents of the pcap file, adding stream segments as found
    #
    pkt_cnt = 0
    ip_pkt_cnt = 0
    unsupported_ip_protocol_cnt = 0
    pcap_ref = pcap.pcap(pcapFN)
    done = False
    while not done:
        try:
            ts, packet = pcap_ref.next()
        except:
            break

        pkt_cnt += 1

        linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0]
        if linkLayerType != ETHERTYPE_IP:
            #
            # We're only interested in IP packets
            #
            continue

        ip_pkt_cnt += 1

        ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2:pcap_ref.dloff + 4])[0]
        ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len]

        pkt_protocol = struct.unpack('B', ip_pkt[9])[0]
        if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP):
            #
            # we're only interested in UDP and TCP packets at the moment
            #
            continue

        pkt_src_addr = inet_ntoa(ip_pkt[12:16])
        pkt_dst_addr = inet_ntoa(ip_pkt[16:20])

        ip_hdr_len_offset = (ord(ip_pkt[0]) & 0x0f) * 4
        ip_payload = ip_pkt[ip_hdr_len_offset:len(ip_pkt)]

        pkt_src_port, pkt_dst_port = struct.unpack('!HH', ip_payload[0:4])

        five_tuple = FiveTuple(pkt_protocol, pkt_src_addr, pkt_src_port, pkt_dst_addr, pkt_dst_port)
        five_tuple_id = str(five_tuple)

        if pkt_protocol == IPPROTO_UDP:
            udp_payload_len = struct.unpack('!H', ip_payload[4:6])[0] - 8
            udp_header = ip_payload[0:8]
            udp_payload = ip_payload[8:len(ip_payload)]
            udp_segment = UdpSegment(five_tuple, udp_header, udp_payload)
            process_udp_segment(builder, udp_segment)
        elif pkt_protocol == IPPROTO_TCP:
            tcp_hdr_len = (ord(ip_payload[12]) >> 4) * 4
            tcp_header = ip_payload[0:tcp_hdr_len]
            tcp_payload = ip_payload[tcp_hdr_len:len(ip_payload)]
            segment = TcpSegment(five_tuple, tcp_header, tcp_payload)
            process_tcp_segment(builder, segment)

    #
    # Having read the contents of the pcap, we fill the database with any
    # remaining TCP and UDP segments
    #
    for tcp_stream in tcp_streams.itervalues():
        db_add_tcp_stream_segments(builder, tcp_stream)
    for udp_stream in udp_streams.itervalues():
        db_add_udp_stream_segments(builder, udp_stream)

    #
    # We've finished with the database
    #
    builder.finish()
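# Illustrative usage sketch (not from the original module): the file names are placeholders.
# Assumes the pcap bindings imported above and the module-level tcp_streams/udp_streams tables
# referenced by the segment-processing helpers.
if __name__ == "__main__":
    enchunk_pcap("capture.pcap", "capture_chunks.db")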
import sys, os, re, importlib, inspect, logging, argparse
from CorpusBuilder import CorpusBuilder

if __name__ == "__main__":
    if len(sys.argv) == 1:
        raise Exception("not enough arguments to the script!")
    elif len(sys.argv) == 2:
        corpus_path = sys.argv[1]
        builder = CorpusBuilder(corpus_path)
        builder.build_corpus("./")
    elif len(sys.argv) == 3:
        option = sys.argv[1]
        # interactive mode
        if option == "-i":
            corpus_path = sys.argv[2]
            builder = CorpusBuilder(corpus_path, verbose=True)
            print ""
            print "Do you want to modify an operation? (h for help)"
            cont = 1
            while cont:
                ans = raw_input("? ")
                if ans == '':
                    cont = 0
                elif ans == 'h':
                    print "type o to re-print all the operations"
                    print "type p <extension> to list the operation's parameters of the given extension"
                    print "type r <extension> <op> to replace an operation for the given extension"
                    print "type s <extension> <parameter> <value> to change an operation parameter"
                    # print "type r <file> to remove a file in the corpus"
                elif ans == 'o':