def processQueries(self):
    self.rel_path = os.path.dirname(os.path.abspath(__file__))
    filename = self.rel_path + self.query_file_name
    # Parse the raw query file: each <doc> element holds a <docno> id followed by the query text
    self.rawQueries = collections.OrderedDict()
    with open(filename, 'r', encoding="utf-8") as file:
        data = file.read()
    soup = BeautifulSoup(data, "html.parser")
    content = soup.findAll("doc")
    for entry in content:
        query_id_tag = entry.find("docno")
        queryid = query_id_tag.getText().strip()
        query_id_tag.extract()
        query = entry.getText().strip()
        self.rawQueries[queryid] = query
    # Optionally remove stop words, then normalize punctuation for every query
    self.queries = collections.OrderedDict()
    corpusbuilder = CorpusBuilder()
    for query_id in self.rawQueries.keys():
        query_text = self.rawQueries[query_id]
        if self.stopping_required:
            query_text_list = query_text.split()
            s_query = self.remove_stopWords(self.stopwords_list, query_text_list)
            query_text = ''.join(s_query[0:len(s_query) - 1])
        content = ""
        # use the (possibly stop-word-filtered) text rather than the raw query
        content = corpusbuilder.punctuation_handler(content, query_text)
        self.queries[query_id] = content
def __init__(self, input_path, output_folder, is_verbose, foreground, self_bg, mel_bg, harm_bg,
             uses_mel_from_json, uses_held_melodics, uses_legacy_parser):
    self.logger = Main.init_logger(is_verbose)
    if not os.path.isabs(input_path):
        input_path = os.path.normpath(os.getcwd() + '/' + input_path)
    self.logger.debug(
        'Script was initialized with the following parameters:\n'
        + settings.DEBUG_INDENT + 'Input file/folder: {0}\n'.format(input_path)
        + settings.DEBUG_INDENT + 'Output folder: {0}\n'.format(output_folder)
        + settings.DEBUG_INDENT + 'Foreground channel(s): {0}\n'.format(foreground)
        + settings.DEBUG_INDENT + 'Self Background channel(s): {0}\n'.format(self_bg)
        + settings.DEBUG_INDENT + 'Melodic Background channel(s): {0}\n'.format(mel_bg)
        + settings.DEBUG_INDENT + 'Harmonic Background channel(s): {0}\n'.format(harm_bg))

    # NOTE! This primitive check will print a warning if the user is building the json file to a
    # folder that is not named `corpus`. If the naming of the corpus folder changes in future
    # versions, this check will be incorrect and must be updated.
    if os.path.normpath(os.path.basename(output_folder)) != settings.CORPUS_FOLDER_NAME:
        self.logger.warn(
            'Output folder is not set to default and will likely not be available inside SoMax, '
            'is this intentional?\n'
            'To ensure correct behaviour, please either run the script directly inside\n'
            'the corpus folder of SoMax or use the -o option to point to this directory.')

    builder = CorpusBuilder(input_path, foreground_channels=foreground, self_bg_channels=self_bg,
                            mel_bg_channels=mel_bg, harm_bg_channels=harm_bg,
                            uses_legacy_parser=uses_legacy_parser)

    # Build the corpus and write all the files (standard, harmonic and melodic)
    output_filepaths = builder.build_corpus(os.path.normpath(output_folder) + '/')
    log_string = "The following files were written:"
    for fp in output_filepaths:
        log_string += '\n' + settings.INFO_INDENT + fp
    self.logger.info(log_string)

    # Overwrite the generated melodic json file if the flag is set
    if uses_mel_from_json:
        self.generate_mel_from_json(output_filepaths, uses_held_melodics)
def runCorpusBuilder(self, is_stemmed_corpus):
    corpusBuilder = CorpusBuilder()
    if is_stemmed_corpus:
        corpusBuilder.create_stemmed_corpus_files()
    else:
        corpusBuilder.initialize_doc_keys()
        corpusBuilder.create_corpus()
def processStemmedQueries(self):
    self.rel_path = os.path.dirname(os.path.abspath(__file__))
    filename = self.rel_path + self.stemmed_query_file
    # The stemmed query file holds one query per line; the 1-based line number becomes the query id
    self.rawQueries = collections.OrderedDict()
    with open(filename, 'r', encoding="utf-8") as file:
        lines = file.read().splitlines()
    for query_id, query in enumerate(lines):
        self.rawQueries[str(query_id + 1)] = query
    self.queries = collections.OrderedDict()
    corpusbuilder = CorpusBuilder()
    for query_id in self.rawQueries.keys():
        content = ""
        content = corpusbuilder.punctuation_handler(content, self.rawQueries[query_id])
        self.queries[query_id] = content
def buildCorpus(outFN, block_size, stream_size, text_ids):
    if len(text_ids) == 0:
        print >>sys.stderr, "Must provide at least one input ID"
        sys.exit(0)
    builder = CorpusBuilder(outFN)
    total_bytes = 0
    stream_id = 0
    stream_bytes = 0
    for text_id in text_ids:
        text_id = int(text_id)
        # Download the e-text from Project Gutenberg and strip its boilerplate header/footer
        text = gutenberg.acquire.load_etext(text_id)
        text = gutenberg.cleanup.strip_headers(text).strip()
        addBlocks(builder, block_size, stream_size, text_id, text)
        total_bytes += len(text)
    builder.finish()
    print "Total:", total_bytes, "bytes."
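# Illustrative usage sketch (not part of the original script): the output filename,
# block/stream sizes and Project Gutenberg e-text IDs below are placeholder values.
# Assumes it runs in the same module, i.e. that the `gutenberg` package is importable
# and that addBlocks() is defined as above.
if __name__ == "__main__":
    buildCorpus("gutenberg_corpus.db", 1024, 64 * 1024, ["11", "1342"])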
def lineCorpus(inFN, outFN):
    '''
    Read lines from file name @inFN and write them as blocks to a new db
    with name @outFN.
    '''
    if not os.path.exists(inFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % inFN
        sys.exit(-1)
    lines = open(inFN).readlines()
    if len(lines) == 0:
        print >> sys.stderr, "Input file contained no lines. Exiting."
        sys.exit(0)
    builder = CorpusBuilder(outFN)
    # write a single stream to contain everything
    streamId = 0
    for l in lines:
        builder.add_chunk(streamId, l.rstrip())
    builder.finish()
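# Illustrative usage sketch (not from the original script): the file names are placeholders.
# Writes each line of a plain-text file as a chunk of a single stream in a new db.
if __name__ == "__main__":
    lineCorpus("sentences.txt", "line_corpus.db")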
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

# Build a corpus "keith_rh" from the MIDI file with channel 1 in the foreground
corpus_path = "./examples/keith.mid"
builder = CorpusBuilder(corpus_path, corpus_name="keith_rh")
print builder.ops

standardOp = builder.ops[''][0]
standardOp.fgChannels = [1]
standardOp.bgChannels = [2]

# Add a melodic operation tracking channel 1
MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])
builder.build_corpus("./examples/output/")

# Build a second corpus "keith_lh" from the same file with the channel roles swapped
corpus_path = "./examples/keith.mid"
builder = CorpusBuilder(corpus_path, corpus_name="keith_lh")
standardOp = builder.ops[''][0]
standardOp.fgChannels = [2]
standardOp.bgChannels = [1]

MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])
builder.build_corpus("./examples/output/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic, OpSomaxHarmonic

corpus_path = "./examples/debussy.mid"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
standardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
standardOp.fgChannels = [2, 3, 4]
standardOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
# MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
# MelOp.fgChannels = [1]
# builder.ops['m'] = (MelOp, builder.ops[''][1])

# print "1: ", standardOp.file_paths
# print "2: ", standardOp.corpus_name
# print "3: ", builder.ops[''][1]

# HarmOp = OpSomaxHarmonic(standardOp.file_paths, standardOp.corpus_name)
# HarmOp.fgChannels = [1]
# builder.ops['h'] = (HarmOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./examples/output/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

corpus_path = "./debussy"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
StandardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
# StandardOp.setParameter('fgChannels', '2 3 4')
# StandardOp.setParameter('bgChannels', '1 2 3 4')
# or directly into the attributes of the Op object
StandardOp.fgChannels = [2, 3, 4]
StandardOp.bgChannels = [1, 2, 3, 4]

HarmOp = builder.ops['h'][0]
HarmOp.fgChannels = [1, 2, 3, 4]
HarmOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
MelOp = OpSomaxMelodic(StandardOp.file_paths, StandardOp.corpus_name)
MelOp.fgChannels = [1]
MelOp.bgChannels = [1, 2, 3, 4]
builder.ops['m'] = (MelOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./../corpus/")
from CorpusBuilder import CorpusBuilder
from ops import OpSomaxMelodic

corpus_path = "./examples/debussy.mid"

# we build the CorpusBuilder object with the path of the corpus
builder = CorpusBuilder(corpus_path)
standardOp = builder.ops[''][0]
print builder.ops

# setting the foreground channels as in the interactive mode
standardOp.fgChannels = [2, 3, 4]
standardOp.bgChannels = [1, 2, 3, 4]

# adding an operation to the standard built one
MelOp = OpSomaxMelodic(standardOp.file_paths, standardOp.corpus_name)
MelOp.fgChannels = [1]
builder.ops['m'] = (MelOp, builder.ops[''][1])

print builder.ops
builder.build_corpus("./examples/output/")
def enchunk_pcap(pcapFN, sqliteFN):
    """Read the contents of a pcap file with name @pcapFN and produce a
    sqlite db with name @sqliteFN. It will contain chunks of data from
    TCP and UDP streams."""
    if not os.path.exists(pcapFN):
        print >> sys.stderr, "Input file '%s' does not exist. Exiting." % pcapFN
        sys.exit(-1)

    builder = CorpusBuilder(sqliteFN)

    #
    # Read in the contents of the pcap file, adding stream segments as found
    #
    pkt_cnt = 0
    ip_pkt_cnt = 0
    unsupported_ip_protocol_cnt = 0
    pcap_ref = pcap.pcap(pcapFN)
    done = False
    while not done:
        try:
            ts, packet = pcap_ref.next()
        except:
            break

        pkt_cnt += 1

        linkLayerType = struct.unpack('!H', packet[(pcap_ref.dloff - 2):pcap_ref.dloff])[0]
        if linkLayerType != ETHERTYPE_IP:
            #
            # We're only interested in IP packets
            #
            continue

        ip_pkt_cnt += 1

        ip_pkt_total_len = struct.unpack('!H', packet[pcap_ref.dloff + 2:pcap_ref.dloff + 4])[0]
        ip_pkt = packet[pcap_ref.dloff:pcap_ref.dloff + ip_pkt_total_len]

        pkt_protocol = struct.unpack('B', ip_pkt[9])[0]
        if (pkt_protocol != IPPROTO_UDP) and (pkt_protocol != IPPROTO_TCP):
            #
            # we're only interested in UDP and TCP packets at the moment
            #
            continue

        pkt_src_addr = inet_ntoa(ip_pkt[12:16])
        pkt_dst_addr = inet_ntoa(ip_pkt[16:20])

        ip_hdr_len_offset = (ord(ip_pkt[0]) & 0x0f) * 4
        ip_payload = ip_pkt[ip_hdr_len_offset:len(ip_pkt)]

        pkt_src_port, pkt_dst_port = struct.unpack('!HH', ip_payload[0:4])

        five_tuple = FiveTuple(pkt_protocol, pkt_src_addr, pkt_src_port, pkt_dst_addr, pkt_dst_port)
        five_tuple_id = str(five_tuple)

        if pkt_protocol == IPPROTO_UDP:
            udp_payload_len = struct.unpack('!H', ip_payload[4:6])[0] - 8
            udp_header = ip_payload[0:8]
            udp_payload = ip_payload[8:len(ip_payload)]
            udp_segment = UdpSegment(five_tuple, udp_header, udp_payload)
            process_udp_segment(builder, udp_segment)
        elif pkt_protocol == IPPROTO_TCP:
            tcp_hdr_len = (ord(ip_payload[12]) >> 4) * 4
            tcp_header = ip_payload[0:tcp_hdr_len]
            tcp_payload = ip_payload[tcp_hdr_len:len(ip_payload)]
            segment = TcpSegment(five_tuple, tcp_header, tcp_payload)
            process_tcp_segment(builder, segment)

    #
    # Having read the contents of the pcap, we fill the database with any
    # remaining TCP and UDP segments
    #
    for tcp_stream in tcp_streams.itervalues():
        db_add_tcp_stream_segments(builder, tcp_stream)
    for udp_stream in udp_streams.itervalues():
        db_add_udp_stream_segments(builder, udp_stream)

    #
    # We've finished with the database
    #
    builder.finish()
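# Illustrative usage sketch (not from the original module): the file names are placeholders.
# Assumes the pcap bindings imported above and the module-level tcp_streams/udp_streams tables
# referenced by the segment-processing helpers.
if __name__ == "__main__":
    enchunk_pcap("capture.pcap", "capture_chunks.db")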
import sys, os, re, importlib, inspect, logging, argparse
from CorpusBuilder import CorpusBuilder

if __name__ == "__main__":
    if len(sys.argv) == 1:
        raise Exception("not enough arguments to the script!")
    elif len(sys.argv) == 2:
        corpus_path = sys.argv[1]
        builder = CorpusBuilder(corpus_path)
        builder.build_corpus("./")
    elif len(sys.argv) == 3:
        option = sys.argv[1]
        # interactive mode
        if option == "-i":
            corpus_path = sys.argv[2]
            builder = CorpusBuilder(corpus_path, verbose=True)
            print ""
            print "Do you want to modify an operation? (h for help)"
            cont = 1
            while cont:
                ans = raw_input("? ")
                if ans == '':
                    cont = 0
                elif ans == 'h':
                    print "type o to re-print all the operations"
                    print "type p <extension> to list the operation's parameters of the given extension"
                    print "type r <extension> <op> to replace an operation for the given extension"
                    print "type s <extension> <parameter> <value> to change an operation parameter"
                    # print "type r <file> to remove a file in the corpus"
                elif ans == 'o':