def main(args): p = argparse.ArgumentParser(description=__doc__) p.add_argument("-n", "--lines",default=10, type=int, help="""print the first K lines instead of 10; if negative, print the last -K lines""") p.add_argument("-q", "--quiet", "--silent", action='store_true', help="never print headers for each file") p.add_argument("-v", "--verbose", action='store_true', help="always print headers for each file") p.add_argument("files", action="store", nargs="*", help="files to print") ns = p.parse_args(args) status = 0 header_fmt = '==> {} <==\n' if len(ns.files) == 0: ns.files = ['-'] try: for fname in ns.files: if ns.verbose or (len(ns.files) > 1 and not ns.quiet): if fname == '-': print(header_fmt.format('standard input'), end='') else: print(header_fmt.format(fname), end='') fileinput.close() inp = fileinput.input(fname, openhook=fileinput.hook_encoded("utf-8")) if ns.lines >= 0: buf = [] for i, line in enumerate(inp): if i >= ns.lines: break buf.append(line) for line in buf: print(line, end='') else: buf = [] for line in fileinput.input(inp, openhook=fileinput.hook_encoded("utf-8")): buf.append(line) if len(buf) > -ns.lines: del buf[0] for line in buf: print(line, end='') except Exception as e: print('head :%s' % str(e)) status = 1 finally: fileinput.close() sys.exit(status)
def main(): parser = arg_parser() args = parser.parse_args() dir = args.onionshare_dir src = files_in(dir, 'onionshare') + files_in(dir, 'onionshare_gui') pysrc = [p for p in src if p.endswith('.py')] htmlsrc = [p for p in src if p.endswith('.html')] translate_keys = set() # load translate key from python source for line in fileinput.input(pysrc, openhook=fileinput.hook_encoded('utf-8')): # search `strings._('translate_key')` # `strings._('translate_key', True)` m = re.search(r'strings\._\((.*?)\)', line) if m: arg = m.group(1) key = arg.split(',')[0].strip('''"' ''') translate_keys.add(key) # load translate key from html source for line in fileinput.input(htmlsrc, openhook=fileinput.hook_encoded('utf-8')): # search `{{strings.translate_key}}` m = re.search(r'{{.*strings\.([-a-zA-Z0-9_]+).*}}', line) if m: key = m.group(1) translate_keys.add(key) if args.show_all_keys: for k in sorted(translate_keys): print k sys.exit() locale_files = [f for f in files_in(dir, 'locale') if f.endswith('.json')] for locale_file in locale_files: with codecs.open(locale_file, 'r', encoding='utf-8') as f: trans = json.load(f) # trans -> {"key1": "translate-text1", "key2": "translate-text2", ...} locale_keys = set(trans.keys()) disused = locale_keys - translate_keys lacked = translate_keys - locale_keys locale, ext = os.path.splitext(os.path.basename(locale_file)) for k in sorted(disused): print locale, 'disused', k for k in sorted(lacked): print locale, 'lacked', k
def test_file_opening_hook(self):
    try:
        # cannot use openhook and inplace mode
        fi = FileInput(inplace=1, openhook=lambda f, m: None)
        self.fail("FileInput should raise if both inplace "
                  "and openhook arguments are given")
    except ValueError:
        pass
    try:
        fi = FileInput(openhook=1)
        self.fail("FileInput should check openhook for being callable")
    except ValueError:
        pass
    if due_to_ironpython_incompatibility("functionality in cpython site.py"):
        # without it, lookup('rot13') will fail due to lack of search functions
        # which were registered in encodings\__init__.py
        import encodings
    if not due_to_ironpython_bug('http://tkbgitvstfat01:8080/WorkItemTracking/WorkItem.aspx?artifactMoniker=148925'):
        try:
            t1 = writeTmp(1, ["A\nB"], mode="wb")
            fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
            lines = list(fi)
            self.assertEqual(lines, ["N\n", "O"])
        finally:
            remove_tempfiles(t1)
def main(args):
    ap = argparse.ArgumentParser()
    ap.add_argument('files', nargs='*', help='files to sort')
    ap.add_argument('-r', '--reverse', action='store_true', default=False,
                    help='reverse the result of comparisons')
    ns = ap.parse_args(args)

    def _print(lines):
        if lines is not None:
            lines = sorted(lines)
            if ns.reverse:
                lines = lines[::-1]
            print(''.join(lines))

    fileinput.close()  # in case it is not closed
    try:
        lines = None
        for line in fileinput.input(ns.files, openhook=fileinput.hook_encoded("utf-8")):
            if fileinput.isfirstline():
                _print(lines)
                lines = []
            lines.append(line)
        _print(lines)
    finally:
        fileinput.close()
def get_preprocessed_text(self, limit=None):
    """
    Generator generates preprocessed list of tokenized words on every call.

    - Read sentence-tokenized intermediate preprocessed file.
    - Tokenize and preprocess words, return list of words from a sentence.
    """
    count = 0
    if limit is None:
        limit = self.limit
    for sentence in fileinput.input(
            files=[self.preprocessed_corpus_path],
            openhook=fileinput.hook_encoded(self.encoding)):
        word_list = itertools.chain(*(
            self._clean_word(word) for word in self._tokenize_words(sentence)
        ))
        word_list = [word for word in word_list if len(word) != 0]
        count += len(word_list)
        if limit is not None and count >= limit:
            fileinput.close()
            # Returning ends the generator; raising StopIteration inside a
            # generator is an error under PEP 479 (Python 3.7+).
            return
        else:
            yield word_list
def parseTag(inputfilename, outputfilename, searchExp): fin = fileinput.input(inputfilename, inplace = 0, openhook = fileinput.hook_encoded(fileencoding)) fout = codecs.open(outputfilename, "w", fileencoding) isblock = 0 for line in fin: newline = line isfirst = searchExp in line islast = "\tMedium;" in line issingleline = isfirst and islast # and "," in line fixquotes = 0 if issingleline: fixquotes = "\t" in extractThirdField(line) # If there is a comma on the third fild, quote it! if fixquotes: newline = leftQuoteThirdField(line) newline = rightQuoteThirdField(newline) print "%d: %s" % (fileinput.filelineno(), newline) # print "%d:(issingle):%s" % (fileinput.filelineno(), newline) if (not issingleline) and (isfirst and not islast): #newline = reverseReplace(line, searchExp, searchExp + '"', 1) newline = leftQuoteThirdField(line) print "quoting left" isblock = 1 if (not issingleline) and (not isfirst and islast and isblock): newline = reverseReplace(line, "\tMedium;", '"' + "\tMedium;", 1) print "quoting right" isblock = 0 #TODO: Fix the single line comma bug fout.write(newline) if issingleline: print "%d: %s" % (fileinput.filelineno(), newline) fout.close()
def run(target_dir, inplaceFlag=0): global showMessageCount, alreadyChanged, showMessageChanged for root, dirs, files in os.walk(target_dir): for file in files: if file.endswith(".jsp") and (file.lower() in teoconstants.jsps): if inplaceFlag == 0: # improve performance f = fileinput.input( root + "\\" + file, inplace=inplaceFlag, openhook=fileinput.hook_encoded("utf-8") ) elif inplaceFlag == 1: f = fileinput.input(root + "\\" + file, inplace=inplaceFlag) for i, line in enumerate(f): if re.search("posui:showImageButtons", line, re.IGNORECASE): showMessageCount += 1 if re.search("isMultiLang", line): alreadyChanged += 1 else: showMessageChanged += 1 line = line.replace("posui:showMessage", 'posui:showMessage isMultiLang="true"') if inplaceFlag == 0: sys.stdout.write(file + " : " + line) if inplaceFlag == 1: sys.stdout.write(line) f.close()
def uniq_files(path_list):
    res = set()
    for line in fileinput.FileInput(path_list,
                                    openhook=fileinput.hook_encoded("utf-8")):
        res.add(line.strip())
    return sorted(list(res))
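A minimal usage sketch, assuming two hypothetical word lists on disk: uniq_files treats every listed file as one stream, so the result is the sorted union of their stripped lines.

# 'words_a.txt' and 'words_b.txt' are assumed input files; any iterable of
# paths accepted by fileinput.FileInput works here.
merged = uniq_files(["words_a.txt", "words_b.txt"])
for entry in merged:
    print(entry)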
def read_csv_file(source_path_param):
    list1 = []
    for line in fileinput.input([source_path_param],
                                openhook=fileinput.hook_encoded("utf8")):
        # find() returns -1 when a line has no trailing '\n' (e.g. the last
        # line of the file), in which case the slice below drops its final
        # character; line.rstrip('\n') would avoid that edge case.
        new_line_index = line.find('\n')
        # print(new_line_index)
        list1.append(line[:new_line_index])
    fileinput.close()
    return list1
def read_channelconfig(config_file):
    channels = dict()
    # Note: this call only builds an open-hook and its return value is
    # discarded, so it has no effect here; the file is opened directly below.
    fileinput.hook_encoded("utf-8")
    file = open(config_file, mode='r', encoding='utf-8')
    while True:
        line = file.readline().lstrip().rstrip('\n')
        if line is None or line == '':
            break
        if not line.startswith("#"):
            regex = re.compile(r'\[([^\]]*)\]\s*([A-Za-z0-9_]+)')
            match = regex.match(line)
            if match:
                channelname = match.group(1)
                channelvalue = match.group(2)
                channel_key = channelname if not (channelname == '' or channelname is None) else channelvalue
                channels["{channelName}".format(channelName=channel_key)] = channelvalue
            del match
            del regex
    return channels
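A hedged illustration of the configuration format the regex above appears to expect; the file name and entries are invented for this example, not taken from the original code.

# channels.conf (hypothetical) -- lines matching '[display name] channel_id':
#
#   # comment lines are skipped
#   [General Chat] general_chat
#   [] fallback_channel
#
# read_channelconfig("channels.conf") would then return
# {'General Chat': 'general_chat', 'fallback_channel': 'fallback_channel'};
# an empty display name falls back to the channel id.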
def main(args):
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='*', help='one or more files to be copied')
    ns = ap.parse_args(args)

    fileinput.close()  # in case it is not closed
    try:
        clipboard.set(''.join(line for line in fileinput.input(
            ns.file, openhook=fileinput.hook_encoded("utf-8"))))
    except Exception as err:
        print("pbcopy: {}: {!s}".format(type(err).__name__, err), file=sys.stderr)
    finally:
        fileinput.close()
def findStringInListOfFiles(self, source_dir, fileList, string, regex=0): for root, dirs, files in os.walk(source_dir): for file in files: if file.lower() in fileList: f = fileinput.input(root+"\\"+ file, inplace=0, openhook=fileinput.hook_encoded('utf-8')) for i, line in enumerate(f): if regex == 0: if line.find(string) != -1: sys.stdout.write(file +':'+str(i)+line) else: if re.search(string, line): sys.stdout.write(file +':'+str(i)+'\t'+line)
def main(args): global _stash ap = argparse.ArgumentParser() ap.add_argument('pattern', help='the pattern to match') ap.add_argument('files', nargs='*', help='files to be searched') ap.add_argument('-i', '--ignore-case', action='store_true', help='ignore case while searching') ap.add_argument('-v', '--invert', action='store_true', help='invert the search result') ap.add_argument('-c', '--count', action='store_true', help='count the search results instead of normal output') ns = ap.parse_args(args) flags = 0 if ns.ignore_case: flags |= re.IGNORECASE pattern = re.compile(ns.pattern, flags=flags) # Do not try to grep directories files = [f for f in ns.files if not os.path.isdir(f)] fileinput.close() # in case it is not closed try: counts = collections.defaultdict(int) for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")): if bool(pattern.search(line)) != ns.invert: if ns.count: counts[fileinput.filename()] += 1 else: if ns.invert: # optimize: if ns.invert, then no match, so no highlight color needed newline = line else: newline = re.sub(pattern, lambda m: _stash.text_color(m.group(), 'red'), line) if fileinput.isstdin(): fmt = u'{lineno}: {line}' else: fmt = u'{filename}: {lineno}: {line}' print(fmt.format(filename=fileinput.filename(), lineno=fileinput.filelineno(), line=newline.rstrip())) if ns.count: for filename, count in counts.items(): fmt = u'{count:6} {filename}' print(fmt.format(filename=filename, count=count)) except Exception as err: print("grep: {}: {!s}".format(type(err).__name__, err), file=sys.stderr) finally: fileinput.close()
def main(): parser = arg_parser() args = parser.parse_args() dir = args.onionshare_dir src = files_in(dir, 'onionshare') + \ files_in(dir, 'onionshare_gui') + \ files_in(dir, 'onionshare_gui/mode') + \ files_in(dir, 'onionshare_gui/mode/share_mode') + \ files_in(dir, 'onionshare_gui/mode/receive_mode') + \ files_in(dir, 'install/scripts') + \ files_in(dir, 'tests') pysrc = [p for p in src if p.endswith('.py')] lang_code = args.lang_code translate_keys = set() # load translate key from python source for line in fileinput.input(pysrc, openhook=fileinput.hook_encoded('utf-8')): # search `strings._('translate_key')` # `strings._('translate_key', True)` m = re.findall(r'strings\._\((.*?)\)', line) if m: for match in m: key = match.split(',')[0].strip('''"' ''') translate_keys.add(key) if args.show_all_keys: for k in sorted(translate_keys): print(k) sys.exit() if lang_code == 'all': locale_files = [f for f in files_in(dir, 'share/locale') if f.endswith('.json')] else: locale_files = [f for f in files_in(dir, 'share/locale') if f.endswith('%s.json' % lang_code)] for locale_file in locale_files: with codecs.open(locale_file, 'r', encoding='utf-8') as f: trans = json.load(f) # trans -> {"key1": "translate-text1", "key2": "translate-text2", ...} locale_keys = set(trans.keys()) disused = locale_keys - translate_keys lacked = translate_keys - locale_keys locale, ext = os.path.splitext(os.path.basename(locale_file)) for k in sorted(disused): print(locale, 'disused', k) for k in sorted(lacked): print(locale, 'lacked', k)
def func(input_dir, file_reg, func, args=None, *, encoding='utf-8'):
    """Run func for every line in the files matching file_reg in input_dir."""
    with fileinput.input(glob.glob(input_dir + file_reg),
                         openhook=fileinput.hook_encoded(encoding)) as f:
        for line in f:
            try:
                if args:
                    func(line, *args)
                else:
                    func(line)
            except Exception:
                print(line)
                raise
def _get_word_from_file(path):
    first = True
    for root, dirs, files in os.walk(path):
        with FileInput(files=_get_full_paths(root, files),
                       openhook=fileinput.hook_encoded(DEFAULT_ENCODING)) as f:
            for line in f:
                if f.isfirstline():
                    if first:
                        first = False
                    else:
                        yield EOD()
                for word in line.split():
                    yield word
    yield EOD()
def Convert(description=u'', output=u'sys.stdout', input=u'sys.stdin'): """The main loop routine in charge to read the data and to report results. Args: description: todo. output: the output channel to be used for the results. input: the input channel to be used to feed l2tcsv data. """ cybox_files = {} cybox_files_related = {} rows = [] openhook = fileinput.hook_encoded(u'utf8') file_in = fileinput.FileInput(input, openhook=openhook) try: reader = csv.DictReader(file_in, fieldnames=L2TCSV_HEADER) # Check if input file or stdin has l2tcsv headers. first_row = reader.next() if first_row[u'date'] != u'date' and first_row[u'extra'] != u'extra': EventToCybox(first_row, cybox_files, cybox_files_related) # Process lines, one-step over data without memory. for row in reader: EventToCybox(row, cybox_files, cybox_files_related) except IOError as exception_io: logging.error(u'IO error: {0:s}'.format(exception_io)) return observables = cyboxObservables() # Actually hard coded. tool = cyboxTools.ToolInformation(u'Plaso') tool.version = u'1.4.1' tool_list = cyboxTools.ToolInformationList() tool_list.append(tool) observables.observable_package_source = cyboxMeasureSource.MeasureSource() observables.observable_package_source.tools = tool_list for key, cybox_file in cybox_files.iteritems(): observables.add(cyboxObservable(cybox_file)) try: if output != u'sys.stdout': file_out = open(output, u'w') else: file_out = sys.stdout file_out.write(observables.to_xml().encode(u'utf8')) except IOError as exception_io: logging.error(u'IO error: {0:s}'.format(exception_io))
def process_data(source_path_param, output, choice):
    for line in fileinput.input([source_path_param],
                                openhook=fileinput.hook_encoded("utf8")):
        value = line.split(',')
        # print(value)
        x = int(value[12].strip())  # hour
        y = 0.0
        if choice == 'retweet':
            y = float(value[3].strip())  # diff_retweet
        elif choice == 'follower_wt_mc':
            y = float(value[10].strip())  # diff_follower_wt_mc
        elif choice == 'follower_wo_mc':
            y = float(value[13].strip())  # diff_follower_wo_mc
        output[x].append(y)
    fileinput.close()
    return
def concat(input_dir, file_reg, output, *, encoding='utf-8', fltr=None):
    """Concatenate all files matching file_reg in input_dir into output."""
    with fileinput.input(glob.glob(input_dir + file_reg),
                         openhook=fileinput.hook_encoded(encoding)) as f, \
            open(output, 'w', encoding=encoding) as o:
        for line in f:
            try:
                if fltr:
                    if re.search(fltr, line):
                        o.write(line)
                else:
                    o.write(line)
            except Exception:
                print(line)
                raise
def generate_game(difficulty, size=6, num=1):
    words = []
    for line in fileinput.input(files=dicts[difficulty],
                                openhook=fileinput.hook_encoded("iso-8859-1")):
        word = line.strip().upper()
        if len(word) > 2:
            words.append(word)
    while True:
        scramble = generate_scramble(size)
        letter_count = make_letter_count(scramble)
        soln = []
        for word in words:
            if can_make_word(letter_count, word):
                soln.append(word)
        if len(soln) > 9:
            return (scramble, soln)
def doReadBigramLattice(filename, bigramLat):
    starttime = time.time()
    ft1 = bigram.BIGRAM_START
    filegen = fileinput.input(filename, openhook=fileinput.hook_encoded('utf8'))
    for ln in filegen:
        ft2 = readFeatureTag(ln)
        bg = bigram(ft1, ft2)
        bigramLat.addItem(bg)
        ft1 = ft2
    ft2 = bigram.BIGRAM_END
    bg = bigram(ft1, ft2)
    bigramLat.addItem(bg)
    endtime = time.time()
    elapsetime = endtime - starttime
    print("Read {0} file in {1} seconds. {2} items".format(filename, elapsetime, bigramLat.getN()))
    return elapsetime
def test_readline(self):
    with open(TESTFN, 'wb') as f:
        f.write(b'A\nB\r\nC\r')
        # Fill TextIOWrapper buffer.
        f.write(b'123456789\n' * 1000)
        # Issue #20501: readline() shouldn't read whole file.
        f.write(b'\x80')
    self.addCleanup(safe_unlink, TESTFN)

    with FileInput(files=TESTFN,
                   openhook=hook_encoded('ascii'), bufsize=8) as fi:
        self.assertEqual(fi.readline(), 'A\n')
        self.assertEqual(fi.readline(), 'B\n')
        self.assertEqual(fi.readline(), 'C\n')
        with self.assertRaises(UnicodeDecodeError):
            # Read to the end of file.
            list(fi)
def extract_diff_ret_or_fol(source_path_param, choose_str):
    list1 = []
    for line in fileinput.input([source_path_param],
                                openhook=fileinput.hook_encoded("utf8")):
        if choose_str == 'retweet':
            diff_ret = line.split(',')[3]
            dot_index = diff_ret.find(".")
            diff_ret_int = int(diff_ret[0:dot_index])
            list1.append(diff_ret_int)  # Normal
        elif choose_str == 'follower_wt_mc':
            diff_fol = float(line.split(',')[10])
            list1.append(diff_fol)  # Normal
        elif choose_str == 'follower_wo_mc':
            diff_fol = float(line.split(',')[13])
            list1.append(diff_fol)  # Normal
    fileinput.close()
    return list1
def main(self, target_dir): for root, dirs, files in os.walk(target_dir): for file in files: if file.endswith(".jsp") and (file.lower() in teoconstants.uipgms): f = fileinput.input(root + "\\" + file, inplace=0, openhook=fileinput.hook_encoded("utf-8")) for i, line in enumerate(f): iterator = re.finditer(r"^.*(?P<imagename>\b.+\.gif\b)", line, re.IGNORECASE) for match in iterator: ## print(file+':'+str(i+1)+'\t'+match.group('imagename')) if match.group("imagename") not in imageList: imageList.append(match.group("imagename")) if imageList: for value in imageList: ## print(commonJobs.printFilenameAndPackage(file,root,'public_html.*',r'\.jsp')) print(file + "\t" + value) imageList[:] = []
def more(filenames, pagesize=10, clear=False, fmt='{line}'):
    '''Display content of filenames pagesize lines at a time (cleared if
    specified) with format fmt for each output line'''
    fileinput.close()  # in case still open
    try:
        pageno = 1
        if clear:
            clear_screen()
        for line in fileinput.input(filenames, openhook=fileinput.hook_encoded("utf-8")):
            lineno, filename, filelineno = fileinput.lineno(), fileinput.filename(), fileinput.filelineno()
            print(fmt.format(**locals()), end='')
            if pagesize and lineno % pagesize == 0:
                console.alert('Abort or continue', filename, 'Next page')  # TODO: use less intrusive mechanism than alert
                pageno += 1
                if clear:
                    clear_screen()
    finally:
        fileinput.close()
def test_readline(self):
    with open(TESTFN, 'wb') as f:
        f.write('A\nB\r\nC\r')
        # Fill TextIOWrapper buffer.
        f.write('123456789\n' * 1000)
        # Issue #20501: readline() shouldn't read whole file.
        f.write('\x80')
    self.addCleanup(safe_unlink, TESTFN)

    fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8)
    # The most likely failure is a UnicodeDecodeError due to the entire
    # file being read when it shouldn't have been.
    self.assertEqual(fi.readline(), u'A\n')
    self.assertEqual(fi.readline(), u'B\r\n')
    self.assertEqual(fi.readline(), u'C\r')
    with self.assertRaises(UnicodeDecodeError):
        # Read to the end of file.
        list(fi)
    fi.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--output', '-o', help='write output to file instead of stdout') parser.add_argument('--split', '-s', help='if writing to file, split into multiple files with this many lines per ' 'file', type=int, default=0) parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to ' 'the field. Example: -e verified user.verified', nargs=2, action='append') parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true') parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used') args = parser.parse_args() file_count = 1 csv_file = None if args.output: if args.split: csv_file = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf-8') file_count += 1 else: csv_file = codecs.open(args.output, 'wb', 'utf-8') else: csv_file = sys.stdout sheet = csv.writer(csv_file) extra_headings = [] extra_fields = [] if args.extra_field: for heading, field in args.extra_field: extra_headings.append(heading) extra_fields.append(field) sheet.writerow(get_headings(extra_headings=extra_headings)) files = args.files if len(args.files) > 0 else ('-',) for count, line in enumerate(fileinput.input(files, openhook=fileinput.hook_encoded("utf-8"))): if args.split and count and count % args.split == 0: csv_file.close() csv_file = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf-8') sheet = csv.writer(csv_file) sheet.writerow(get_headings(extra_headings=extra_headings)) file_count += 1 tweet = json.loads(line) sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))
def test_file_opening_hook(self):
    try:
        # cannot use openhook and inplace mode
        fi = FileInput(inplace=1, openhook=lambda f, m: None)
        self.fail("FileInput should raise if both inplace "
                  "and openhook arguments are given")
    except ValueError:
        pass
    try:
        fi = FileInput(openhook=1)
        self.fail("FileInput should check openhook for being callable")
    except ValueError:
        pass
    try:
        t1 = writeTmp(1, ["A\nB"], mode="wb")
        fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
        lines = list(fi)
        self.assertEqual(lines, ["N\n", "O"])
    finally:
        remove_tempfiles(t1)
def load_data(files):
    """
    Extract zip and process information into CSV's.

    Parameters
    ----------
    files : list of str

    Returns
    -------
    str : combined data from files
    """
    log.info('Loading data: %s.' % ', '.join(files))
    raw_data = fileinput.FileInput(files=files,
                                   openhook=fileinput.hook_encoded('utf-8'))
    log.info('Done loading data.')
    return raw_data
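A design note the snippet does not spell out: FileInput opens files lazily, so the 'Done loading data.' message is logged before any file has actually been read. A minimal consumption sketch follows, with assumed file names and a placeholder handler.

# 'part1.csv' and 'part2.csv' are hypothetical inputs; handle_line stands in
# for whatever the caller does with each decoded line.
def handle_line(line):
    pass

for line in load_data(['part1.csv', 'part2.csv']):
    handle_line(line)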
def concatenate_files(input_directory, output_file):
    """
    :param input_directory:
    :param output_file:
    :return:
    """
    assert os.path.isdir(input_directory), 'input path should be a directory'
    if not input_directory.endswith('/'):
        input_directory = ''.join((input_directory, '/'))
    if not check_file_exist(output_file):
        file_names = os.listdir(input_directory)
        file_paths = [''.join((input_directory, f_n)) for f_n in file_names]
        with open(output_file, 'w', encoding='utf-8') as out_file:
            in_file = fileinput.input(files=file_paths,
                                      openhook=fileinput.hook_encoded('utf-8'))
            # python 2.7.10: fileinput does not have `__exit__` --> cannot use `with`
            for line in in_file:
                out_file.write(line)
            in_file.close()
def main(): Topic = [] Utterance = [] Relevance = [] regex = u'[^ぁ-ん]+' all_filepaths = glob.glob('./training/*') for filepath in all_filepaths: lines = [ line.rstrip() for line in fileinput.input( filepath, openhook=fileinput.hook_encoded('utf-8')) ] # JSON全体の文法チェック try: arguments = json.loads('\n'.join(lines)) except json.JSONDecodeError as e: print('エラーあり') print(e) exit(1) # Display title #print(arguments[0]["Topic"]) for argument in arguments: Topic.append(argument["Topic"]) Utterance.append(argument["Utterance"]) Relevance.append(argument["Relevance"]) TrueDataset = {} correctAnswer = 0 for line in list(set(Utterance)): T_List = [] R_list = [] for line_l in range(len(Utterance)): if line == Utterance[line_l]: T_List.append(Topic[line_l]) R_list.append(Relevance[line_l]) TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str( Counter(R_list).most_common()[0][0]) # Analyze Utterance using Juman++ & knp jumanpp = Jumanpp() with open("incorrect.txt", "w") as wf: line_cnt = len(TrueDataset) now_line_cnt = 0 for key, label in TrueDataset.items(): tpc, utr = key.split(":")[0], key.split(":")[1] #print(tpc + ":" + utr + "[" + label + "]") #parse Topic topic_analyed_List = [] try: #0.7909880035111675 #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] #topic_result = jumanpp.analysis(s) topic_result = jumanpp.analysis(format_text(tpc)) #print(s) for mrph in topic_result.mrph_list(): try: if len(re.findall(regex, mrph.midasi)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): topic_analyed_List.append(mrph.midasi) except: continue except: #print("Error.",tpc) continue #parse Utterance utter_analyed_List = [] try: utter_result = jumanpp.analysis(utr) for mrph in utter_result.mrph_list(): try: if len(re.findall(regex, mrph.midasi)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): utter_analyed_List.append(mrph.midasi) except: continue except: #print("Error.",utr) continue #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List))) if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0): #print("1:",label) if int(label) == 1: correctAnswer += 1 else: wf.write(tpc + ":" + utr + "[" + "1" + ":" + label + "]\n") else: #print("0:",label) if int(label) == 0: correctAnswer += 1 else: wf.write(tpc + ":" + utr + "[" + "0" + ":" + label + "]\n") now_line_cnt += 1 #print( now_line_cnt, "/", line_cnt) print("acurracy:", correctAnswer * 1.0 / len(TrueDataset))
default="utf-8", choices=["utf-8", "utf-8-sig", "utf-16"], help="Input file encoding") parser.add_argument("-v", action='store_true', default=False) args = parser.parse_args() if (args.v): print(args) questions = [] # with open(args.inputfile, 'r', encoding='utf-8') as f: with fileinput.FileInput(files=args.inputfile, mode='r', openhook=fileinput.hook_encoded(args.encoding)) as f: fw = FileWrapper(f) qtype = QType.MC while True: line = fw.readline() # print(line) if not line: break if m := re.match(r"Type:\s*(F|FIB|MC|FIB_PLUS|FMB|E|ESS)$", line): qtype = Str2QType[m.group(1)] if args.v: print("Question type:", t.name) elif m := re.match(r"(\d+)\.\s+(.+)", line): fw.unreadline(line) if qtype == QType.FIB: q = FIBQuestion() load_question(q, fw, True)
def main(): all_filepaths = glob.glob('./training/*') #print("frhifr",all_filepaths) Topic = [] Utterance = [] Relevance = [] FactCheck = [] Stance = [] for filepath in all_filepaths: # args = get_args() # JSON読み込み # src = '-' if not hasattr(args, 'json_file') else args.json_file lines = [ line.rstrip() for line in fileinput.input( filepath, openhook=fileinput.hook_encoded('utf-8')) ] # JSON全体の文法チェック try: arguments = json.loads('\n'.join(lines)) except json.JSONDecodeError as e: print('エラーあり') print(e) exit(1) # Display title #print(arguments[0]["Topic"]) for argument in arguments: Topic.append(argument["Topic"]) Utterance.append(argument["Utterance"]) Relevance.append(argument["Relevance"]) FactCheck.append(argument["Fact-checkability"]) Stance.append(argument["Stance"]) TrueDataset = [] for line in list(set(Utterance)): cnt = 0 R_list = [] F_list = [] S_list = [] for line_l in range(len(Utterance)): if line == Utterance[line_l]: cnt += 1 R_list.append(Relevance[line_l]) F_list.append(FactCheck[line_l]) S_list.append(Stance[line_l]) plane = line + " " + str( Counter(R_list).most_common()[0][0]) + " " + str( Counter(F_list).most_common()[0][0]) + " " + str( Counter(S_list).most_common()[0][0]) if not ((cnt == 5 and Counter(S_list).most_common()[0][1] == 2) or (cnt == 3 and Counter(S_list).most_common()[0][1] == 1)): TrueDataset.append(plane) # Analyze Utterance using Juman++ jumanpp = Jumanpp() for arguments in TrueDataset: #print(argument["Utterance"],argument["Relevance"],argument["Fact-checkability"],argument["Stance"],argument["Class"]) argument = arguments.split(" ") result = jumanpp.analysis(argument[0]) analyed_argument = "" for mrph in result.mrph_list(): if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): analyed_argument += mrph.midasi + " " analyed_argument += "\t" analyed_argument += argument[1] + "\t" analyed_argument += argument[2] + "\t" analyed_argument += argument[3] print(analyed_argument)
def weave(self): self.abstract_task_desc.setdefault('extra C files', dict()) clade = Clade(self.conf['build base']) if not clade.work_dir_ok(): raise RuntimeError('Build base is not OK') meta = clade.get_meta() # This is required to get compiler (Aspectator) specific stdarg.h since kernel C files are compiled # with "-nostdinc" option and system stdarg.h couldn't be used. aspectator_search_dir = '-isystem' + klever.core.utils.execute( self.logger, ('aspectator', '-print-file-name=include'), collect_all_stdout=True)[0] env = dict(os.environ) # Print stubs instead of inline Assembler since verifiers do not interpret it and even can fail. env['LDV_INLINE_ASM_STUB'] = '' for grp in self.abstract_task_desc['grps']: self.logger.info('Weave in C files of group "{0}"'.format(grp['id'])) for extra_cc in grp['Extra CCs']: # Each CC is either pair (compiler command identifier, compiler command type) or JSON file name # with compiler command description. if isinstance(extra_cc['CC'], list): cc = clade.get_cmd(*extra_cc['CC'], with_opts=True) else: with open(os.path.join(self.conf['main working directory'], extra_cc['CC']), encoding='utf8') as fp: cc = json.load(fp) # extra_cc is a cc command that is not from Clade # Thus paths in it need to be converted to be absolute # like in other Clade commands if "cwd" in cc and "in" in cc: cc["in"] = [os.path.join(cc["cwd"], cc_in) for cc_in in cc["in"]] if "cwd" in cc and "out" in cc: cc["out"] = [os.path.join(cc["cwd"], cc_out) for cc_out in cc["out"]] if "in file" in extra_cc: # This is for CC commands with several input files infile = extra_cc["in file"] else: infile = cc["in"][0] # Distinguish source files having the same names. outfile_unique = '{0}.c'.format(klever.core.utils.unique_file_name(os.path.splitext(os.path.basename( infile))[0], '.c')) # This is used for storing/getting to/from cache where uniqueness is guaranteed by other means. outfile = '{0}.c'.format(os.path.splitext(os.path.basename(infile))[0]) self.logger.info('Weave in C file "{0}"'.format(infile)) # Produce aspect to be weaved in. if 'plugin aspects' in extra_cc: self.logger.info('Concatenate all aspects of all plugins together') # Resulting aspect. aspect = 'aspect' # Get all aspects. Place RSG aspects at beginning since they can instrument entities added by # aspects of other plugins while corresponding function declarations still need be at beginning # of file. aspects = [] for plugin_aspects in extra_cc['plugin aspects']: if plugin_aspects['plugin'] == 'RSG': aspects[0:0] = plugin_aspects['aspects'] else: aspects.extend(plugin_aspects['aspects']) # Concatenate aspects. with open(aspect, 'w', encoding='utf8') as fout, fileinput.input( [os.path.join(self.conf['main working directory'], aspect) for aspect in aspects], openhook=fileinput.hook_encoded('utf8')) as fin: for line in fin: fout.write(line) else: # Instrumentation is not required when there is no aspects. But we will still pass source files # through C-backend to make resulting code to look similarly and thus to avoid different issues # at merging source files and models together. 
aspect = None if aspect: self.logger.info('Aspect to be weaved in is "{0}"'.format(aspect)) else: self.logger.info('C file will be passed through C Back-end only') storage_path = clade.get_storage_path(infile) if meta['conf'].get('Compiler.preprocess_cmds', False) and \ 'klever-core-work-dir' not in storage_path: storage_path = storage_path.split('.c')[0] + '.i' cwd = clade.get_storage_path(cc['cwd']) is_model = (grp['id'] == 'models') # Original sources should be woven in and we do not need to get cross references for them since this # was already done before. if not is_model: self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd, aspectator_search_dir, is_model) # For generated models we need to weave them in (actually, just pass through C Back-end) and to get # cross references always since most likely they all are different. elif 'generated' in extra_cc: self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd, aspectator_search_dir, is_model) if self.conf['code coverage details'] != 'Original C source files': self.__get_cross_refs(storage_path, cc['opts'], outfile_unique, clade, cwd, aspectator_search_dir) # For non-generated models use results cache in addition. else: cache_dir = os.path.join(self.conf['cache directory'], klever.core.utils.get_file_checksum(storage_path)) with klever.core.utils.LockedOpen(cache_dir + '.tmp', 'w'): if os.path.exists(cache_dir): self.logger.info('Get woven in C file from cache') self.abstract_task_desc['extra C files'].append( {'C file': os.path.relpath(os.path.join(cache_dir, os.path.basename(outfile)), self.conf['main working directory'])}) if self.conf['code coverage details'] != 'Original C source files': self.logger.info('Get cross references from cache') self.__merge_additional_srcs(os.path.join(cache_dir, 'additional sources')) else: os.makedirs(cache_dir) self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd, aspectator_search_dir, is_model) self.logger.info('Store woven in C file to cache') shutil.copy(outfile_unique, os.path.join(cache_dir, outfile)) if self.conf['code coverage details'] != 'Original C source files': self.__get_cross_refs(storage_path, cc['opts'], outfile_unique, clade, cwd, aspectator_search_dir) self.logger.info('Store cross references to cache') shutil.copytree(outfile_unique + ' additional sources', os.path.join(cache_dir, 'additional sources')) # For auxiliary files there is no cross references since it is rather hard to get them from Aspectator. But # there still highlighting. if self.conf['code coverage details'] == 'All source files': for aux_file in glob.glob('*.aux'): new_file = os.path.join('additional sources', 'generated models', os.path.relpath(aux_file, self.conf['main working directory'])) os.makedirs(os.path.dirname(new_file), exist_ok=True) shutil.copy(aux_file, new_file) cross_refs = CrossRefs(self.conf, self.logger, clade, aux_file, new_file, self.search_dirs) cross_refs.get_cross_refs() self.abstract_task_desc['additional sources'] = os.path.relpath('additional sources', self.conf['main working directory']) \ if os.path.isdir('additional sources') else None # Copy additional sources for total code coverage. if self.conf['code coverage details'] != 'Original C source files': with klever.core.utils.Cd('additional sources'): for root, dirs, files in os.walk(os.path.curdir): for file in files: # These files are handled below in addition to corresponding source files. 
if file.endswith('.json'): continue if self.conf['code coverage details'] == 'C source files including models' \ and not file.endswith('.c'): continue file = os.path.join(root, file) new_file = os.path.join(self.conf['additional sources directory'], file) os.makedirs(os.path.dirname(new_file), exist_ok=True) with klever.core.utils.LockedOpen(new_file + '.tmp', 'w'): if os.path.isfile(new_file): os.remove(new_file + '.tmp') continue shutil.copy(file, new_file) shutil.copy(file + '.idx.json', new_file + '.idx.json') os.remove(new_file + '.tmp') # These sections won't be refereed any more. del (self.abstract_task_desc['grps']) del (self.abstract_task_desc['deps'])
def check(mode, expected_lines):
    fi = FileInput(files=TESTFN, mode=mode,
                   openhook=hook_encoded('utf-7'))
    lines = list(fi)
    fi.close()
    self.assertEqual(lines, expected_lines)
def parse_csv(): """Given the location of CSV and TXT files, parse the CSV for notable items""" error_output = list() Container = list() if Config.yara_folder and has_yara: yara_rules = yara_import_rules() else: yara_rules = '' if Config.debug: print('[_] Loaded rules:', type(yara_rules)) # Use fileinput.input() now to read data line-by-line if Config.debug: print('[_] Parsing in CSV contents...') for original_line in fileinput.input( Config.csv_file, openhook=fileinput.hook_encoded('iso-8859-1')): evt = None server = '' # Ignore lines beginning w/ a tab or non-quote. if original_line[0] != '"': continue line = original_line.strip(whitespace + '"') field = line.strip().split('","') try: if field[3] in ['Process Create'] and field[5] == 'SUCCESS': cmdline = field[6].split('Command line: ')[1] if not blacklist_scan(cmd_blacklist, field): if Config.generalize_paths: cmdline = generalize_var(cmdline) child_pid = field[6].split('PID: ')[1].split(',')[0] evt = Event(time=field[0], group='Process', activity='CreateProcess', process=field[1], PID=field[2], process_value=cmdline.replace('"', ''), child_pid=child_pid) elif field[3] == 'CreateFile' and field[5] == 'SUCCESS': if not blacklist_scan(file_blacklist, field): path = field[4] if os.path.isdir(path): if Config.generalize_paths: path = generalize_var(path) evt = Event(time=field[0], group='File', activity='CreateFolder', process=field[1], PID=field[2], process_value=path) else: yara_hits = '' av_hits = '' if Config.generalize_paths: path = generalize_var(path) evt = Event(time=field[0], group='File', activity='CreateFile', process=field[1], PID=field[2], process_value=path) if file_exists(path): if Config.debug: print('[_] File: %s\texists' % path) try: md5 = md5_file(path) evt.tags['MD5'] = md5 if Config.debug: print('[_]\t%s' % md5) except (IndexError, IOError): md5 = '' if Config.debug: print('[_]\tMD5 could not be calculated') if Config.yara_folder and yara_rules: print('[*] Scanning with YARA: %s' % path) yara_hits = yara_filescan(path, yara_rules) if yara_hits: evt.tags['YARA'] = yara_hits if Config.debug: print('[_] YARA: %s' % yara_hits) else: if Config.debug: print('[_] No YARA hits.') if has_virustotal: av_hits = virustotal_scan_file(md5) if av_hits: evt.tags['VirusTotal'] = av_hits if Config.debug: print('[_] VT: %s' % av_hits) elif field[3] == 'SetDispositionInformationFile' and field[ 5] == 'SUCCESS': if not blacklist_scan(file_blacklist, field): path = field[4] if Config.generalize_paths: path = generalize_var(path) evt = Event(time=field[0], group='File', activity='DeleteFile', process=field[1], PID=field[2], process_value=path) elif field[3] == 'SetRenameInformationFile': if not blacklist_scan(file_blacklist, field): from_file = field[4] to_file = field[6].split('FileName: ')[1].strip('"') if Config.generalize_paths: from_file = generalize_var(from_file) to_file = generalize_var(to_file) evt = Event(time=field[0], group='File', activity='RenameFile', process=field[1], PID=field[2], process_value='%s => %s' % (from_file, to_file)) elif field[3] == 'RegCreateKey' and field[5] == 'SUCCESS': if not blacklist_scan(reg_blacklist, field): evt = Event(time=field[0], group='Registry', activity='RegCreateKey', process=field[1], PID=field[2], process_value=field[4]) elif field[3] == 'RegSetValue' and field[5] == 'SUCCESS': if not blacklist_scan(reg_blacklist, field): reg_length = field[6].split('Length:')[1].split( ',')[0].strip(whitespace + '"') if int(reg_length): data_field = field[6].split('Data:')[1].strip( whitespace + '"') if 
len(data_field.split(' ')) == 16: data_field += ' ...' evt = Event(time=field[0], group='Registry', activity='RegSetValue', process=field[1], PID=field[2], process_value='%s = %s' % (field[4], data_field)) elif field[3] == 'RegDeleteValue': # and field[5] == 'SUCCESS': # SUCCESS is commented out to allows all attempted deletions, whether or not the value exists if not blacklist_scan(reg_blacklist, field): evt = Event(time=field[0], group='Registry', activity='RegDeleteValue', process=field[1], PID=field[2], process_value=field[4]) elif field[3] == 'RegDeleteKey': # and field[5] == 'SUCCESS': # SUCCESS is commented out to allows all attempted deletions, whether or not the value exists if not blacklist_scan(reg_blacklist, field): evt = Event(time=field[0], group='Registry', activity='RegDeleteKey', process=field[1], PID=field[2], process_value=field[4]) elif (field[3] == 'UDP Send' or field[3] == 'UDP Receive') and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] hostname = server.split(':')[0] # TODO: work on this later, once I can verify it better. #if field[6] == 'Length: 20': # output_line = '[DNS Query] %s:%s > %s' % (field[1], field[2], protocol_replace(server)) #else: evt = Event(time=field[0], group='Network', activity='UDP', process=field[1], PID=field[2], process_value=protocol_replace(server), hostname=hostname) elif (field[3] == 'TCP Send' or field[3] == 'TCP Receive') and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] hostname = server.split(':')[0] evt = Event(time=field[0], group='Network', activity='TCP', process=field[1], PID=field[2], process_value=protocol_replace(server), hostname=hostname) except IndexError: if Config.debug: sys.stderr.write(line) sys.stderr.write(format_exc()) error_output.append(original_line.strip()) if evt: Container.append(evt) if error_output: error_str = '' error_str += '\r\n\r\n\r\n\r\n\r\n\r\nERRORS DETECTED' error_str += 'The following items could not be parsed correctly:' for error in error_output: error_str += error #} End of file input processing return Container
sqlElements += ',%s' sqlUpdate += ',' + header[i] + '=values(' + header[i] + ')' sqlVal = sqlVal.replace(",", "", 1) sqlElements = sqlElements.replace(",", "", 1) sqlUpdate = sqlUpdate.replace(",", "", 1) for root, dirs, files in os.walk(os.path.join('./')): for name in files: fileName = os.path.join(root, name) if name.endswith('csv'): print(fileName) sql_value = [] c = 0 for line in fileinput.input(fileName, openhook=fileinput.hook_encoded( "utf-8", "surrogateescape")): if c == 0: c = c + 1 continue contentList = line.split(',') tp = [] for i in range(len(header)): tp.append(contentList[i]) sql_value.append(tuple(tp)) cursor.execute(""" IF OBJECT_ID('test', 'U') IS NOT NULL DROP TABLE test CREATE TABLE test (
def main(self): hist = [] current = 0 parser = argparse.ArgumentParser() parser.add_argument('--title', help='set the plot title') parser.add_argument('--encoding', help='E.g. iso-8895-1, utf-8') parser.add_argument('--fly', help='Update the plot on the fly', action='store_true') parser.add_argument('--verbose', help='Output verbosely', action='store_true') parser.add_argument('--run-command', help='Output verbosely', action='store_true') parser.add_argument( '--range-max', help='set the max value to filter out the abnormal', type=int) parser.add_argument( '--range-min', help='set the min value to filter out the abnormal', type=int) parser.add_argument('--xliml', help='The left xlim in data coordinates', type=float) parser.add_argument('--xlimr', help='The right xlim in data coordinates', type=float) parser.add_argument('--config', help='Config INI file') parser.add_argument('--output', help='Record log') parser.add_argument('file', metavar='FILE', help='files to read, if empty, stdin is used') args = parser.parse_args() if args.encoding: self.ENCODING = args.encoding if args.title: self.TITLE = args.title if args.range_max: self.range_max = args.range_max if args.range_min: self.range_min = args.range_min if args.xliml: self.XLIM_LEFT = args.xliml if args.xlimr: self.XLIM_RIGHT = args.xlimr if not args.run_command: self.command = None if args.output: self.log_output = args.output if args.config: self.config = args.config self.read_config_file() self.verbose = args.verbose self.UPDATE_ON_THE_FLY = args.fly if self.log_output: self.output_file = open(self.log_output, 'w') self.run_command() for line in fileinput.input(args.file, openhook=fileinput.hook_encoded( self.ENCODING)): line = line.strip() line_no = fileinput.lineno() self.line = line print_log = False print_current = False print_end = False for pattern in self.PRINT_PATTERN: if pattern in line: self.t_current = LogTimeProfiler.parse_time(line) print_log = True if not self.measure_started and self.MEASURE_START in line: self.measure_started = True print('measure_started =', self.measure_started) self.t_current = LogTimeProfiler.parse_time(line) self.t_request = self.t_current self.t_session_start = self.t_current print_log = True elif self.measure_started and self.MEASURE_END in line: self.measure_started = False print('measure_started =', self.measure_started) self.t_current = LogTimeProfiler.parse_time(line) self.t_response = self.t_current print_log = True current = round(self.t_response - self.t_request, 2) print(self.command, self.command_delay) if self.command and self.command_delay: if self.action_timer: self.action_timer.cancel() self.action_timer = Timer(self.command_delay / 1000.0, self.run_command, [True]) self.action_timer.start() if current < 0: self.print_log(line_no, line) continue if self.range_min and current < self.range_min: self.print_log(line_no, line) continue if self.range_max and current > self.range_max: self.print_log(line_no, line) continue print_current = True print_end = True hist.append(current) self.test_count += 1 if self.UPDATE_ON_THE_FLY: self.show_plot(hist) if self.verbose and print_log: self.print_log(line_no, self.t_current - self.t_request, line) if print_current: self.print_log(current) if print_end: self.print_log('-' * 80) if self.command and self.command_count == self.test_count: break self.print_log('=' * 10, self.TITLE, 'Summary', '=' * 10) self.print_log('Result Count: {}'.format(len(hist))) self.print_log( 'Benchmark: max = {}, min = {}, mean = {:.2f}, std = {:.2f}, mode = {:.2f}' 
.format(max(hist), min(hist), np.mean(hist), np.std(hist), np.median(hist))) self.print_log(hist) self.show_plot(hist) plt.show()
def run(target_dir, inplaceFlag=0): global showHeaderTableCount, alreadyChanged, showHeaderTableChanged retrieveFlag = 0 # 1: start retrieve; 2: end retrieve for root, dirs, files in os.walk(target_dir): for file in files: showHeaderTable = '' oldHeader='' if file.endswith('.jsp') and (file.lower() in teoconstants.uipgms): print('Processing '+file) if inplaceFlag == 0: #improve performance f = fileinput.input(root+"\\"+ file, inplace=inplaceFlag, openhook=fileinput.hook_encoded('utf-8')) elif inplaceFlag == 1: f = fileinput.input(root+"\\"+ file, inplace=inplaceFlag) for i, line in enumerate(f): if(re.search('posui:showHeaderTable', line, re.IGNORECASE)): showHeaderTableCount += 1 retrieveFlag = 1 showHeaderTable += line if retrieveFlag == 1: if(not re.search('posui:showHeaderTable', line))and(not re.search('/>', line)): showHeaderTable += line indent = (re.search('^(?P<indent>[ \t]*)[a-zA-Z\</\n]?',line,re.IGNORECASE)).group('indent') if(re.search('headers.*\=.*"\<%\=',showHeaderTable)): # E.g: headers = "<%=headerTit%>" if inplaceFlag == 0: print(' Unappropriate header found at line '+str(i+1)) elif inplaceFlag == 1: print(line) retrieveFlag = 2 continue elif(re.search('headers.*\=',line)and(line.count('"')==2)): headIndent = indent oldHeader=(re.search('.*\"(?P<header>.*)\"',line)).group('header') if(inplaceFlag == 0): print('oldHeader case 1='+oldHeader) line = appendToolTipsToNewHeader(oldHeader, headIndent) if inplaceFlag == 1: line = line.encode('utf-8') oldHeader='' elif(re.search('headers.*\=',line)and(line.count('"')==1)): headIndent = indent oldHeader=(re.search('.*\"(?P<header>.*)',line)).group('header') continue elif((oldHeader != '') and (line.count('"')==0)): oldHeader += (re.search('^[ \t]*(?P<header>.*)',line)).group('header') continue elif((oldHeader != '') and (line.count('"')==1)): oldHeader += (re.search('^[ \t]*(?P<header>.*)\"',line)).group('header') oldHeader = oldHeader.replace('\r','') oldHeader = re.sub(';$','',oldHeader) if(inplaceFlag == 0): print('oldHeader case 2='+oldHeader) line = appendToolTipsToNewHeader(oldHeader, headIndent) if inplaceFlag == 1: line = line.encode('utf-8') oldHeader='' if (re.search('/>', line)): retrieveFlag = 2 if (not re.search('isMultiLang', showHeaderTable)): line = indent+'isMultiLang="true"\n' \ +indent+'tableEvent="nowrap style=\'table-layout:fixed\'"\n' \ +indent+'toolTipLocales="en"\n' \ +indent+'isColspanFix="true"\n' \ +indent+'isNoToolTipScript="true"\n' \ +line else: alreadyChanged += 1 if inplaceFlag == 0: print('fffffffffffffffffffffffff') showHeaderTable += line if inplaceFlag == 0: sys.stdout.write(showHeaderTable) if retrieveFlag == 2: showHeaderTable = '' retrieveFlag = 0 if inplaceFlag == 1: sys.stdout.write(line) f.close()
import fileinput
import pprint

results = {}
words_to_look_for = ('monster', 'monsters')

with fileinput.input(openhook=fileinput.hook_encoded("utf-8")) as f:
    for line in f:
        for word in line.split(' '):
            word = word.lower().strip('"')
            if word in results:
                results[word] += 1
            else:
                if word in words_to_look_for:
                    results[word] = 1

print("Results: ")
pprint.pprint(results)
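A usage note, not part of the original script: fileinput.input() with no files argument reads the file names given on the command line, or standard input when none are given. The script and corpus names below are assumptions.

# Run against a file or a pipe (hypothetical names):
#   python count_monsters.py frankenstein.txt
#   cat frankenstein.txt | python count_monsters.py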
def main(args): global _stash ap = argparse.ArgumentParser() ap.add_argument('pattern', help='the pattern to match') ap.add_argument('files', nargs='*', help='files to be searched') ap.add_argument('-i', '--ignore-case', action='store_true', help='ignore case while searching') ap.add_argument('-v', '--invert', action='store_true', help='invert the search result') ap.add_argument('-c', '--count', action='store_true', help='count the search results instead of normal output') ns = ap.parse_args(args) flags = 0 if ns.ignore_case: flags |= re.IGNORECASE pattern = re.compile(ns.pattern, flags=flags) # Do not try to grep directories files = [f for f in ns.files if not os.path.isdir(f)] fileinput.close() # in case it is not closed try: counts = collections.defaultdict(int) for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")): if bool(pattern.search(line)) != ns.invert: if ns.count: counts[fileinput.filename()] += 1 else: if ns.invert: # optimize: if ns.invert, then no match, so no highlight color needed newline = line else: newline = re.sub( pattern, lambda m: _stash.text_color(m.group(), 'red'), line) if fileinput.isstdin(): fmt = u'{lineno}: {line}' else: fmt = u'{filename}: {lineno}: {line}' print( fmt.format(filename=fileinput.filename(), lineno=fileinput.filelineno(), line=newline.rstrip())) if ns.count: for filename, count in counts.items(): fmt = u'{count:6} {filename}' print(fmt.format(filename=filename, count=count)) except Exception as err: print("grep: {}: {!s}".format(type(err).__name__, err), file=sys.stderr) finally: fileinput.close()
def main(): print("fsovs") Topic = [] Utterance = [] Relevance = [] regex = u'[^ぁ-ん]+' #学習用データ form[label, Topic & Utterce] wf_Data = open("Tpc&UTR_Stance.csv","w") all_filepaths=glob.glob('./training/*') for filepath in all_filepaths: lines = [line.rstrip() for line in fileinput.input( filepath, openhook=fileinput.hook_encoded('utf-8'))] # JSON全体の文法チェック try: arguments = json.loads('\n'.join(lines)) except json.JSONDecodeError as e: print('エラーあり') print(e) exit(1) # Display title #print(arguments[0]["Topic"]) for argument in arguments: Topic.append(argument["Topic"]) Utterance.append(argument["Utterance"]) Relevance.append(argument["Stance"]) TrueDataset = {} correctAnswer_0 = 0 correctAnswer_1 = 0 for line in list(set(Utterance)): T_List = [] R_list = [] for line_l in range(len(Utterance)): if line == Utterance[line_l]: T_List.append(Topic[line_l]) R_list.append(Relevance[line_l]) TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(Counter(R_list).most_common()[0][0]) sorted(TrueDataset.items()) # Analyze Utterance using Juman++ & knp jumanpp = Jumanpp() with open("incorrectTrus.txt","w") as wf: line_cnt = len(TrueDataset) now_line_cnt = 0 for key, label in TrueDataset.items(): tpc,utr = key.split(":")[0],key.split(":")[1] topANDutrANDlabelList = [] #parse Topic topic_analyed_List = [] topANDutrANDlabelList.append("Topic") try: #0.7909880035111675 #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] #topic_result = jumanpp.analysis(s) topic_result = jumanpp.analysis(format_text(tpc)) #print(s) for mrph in topic_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: topic_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: topic_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) except: continue except: continue #parse Utterance utter_analyed_List = [] topANDutrANDlabelList.append("Utterance") try: if "、" in utr: utrList = utr.split("、") for sentence in utrList: #reigi if sentence == "": continue utter_result = jumanpp.analysis(sentence) for mrph in utter_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) else: continue except: print("error") continue else: utter_result = jumanpp.analysis(utr) for mrph in utter_result.mrph_list(): try : if len(re.findall(regex, mrph.genkei)) > 0: if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi): if "数量" in mrph.imis: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append("[数]") else: utter_analyed_List.append(mrph.genkei) topANDutrANDlabelList.append(mrph.genkei) except: print("error") continue topANDutrANDlabelList.append("END") except: print("error") continue if "END" in topANDutrANDlabelList: #print(topANDutrANDlabelList) wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1])+"\n") #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List))) #if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0): #print("1:",label) if int(label) == 1: wf.write(tpc + ":" + utr + "[" + "1" + ":" +label + "]\n") elif int(label) == 2: wf.write(tpc + ":" + utr + "[" + "2" + ":" +label + "]\n") else: wf.write(tpc + ":" + utr + "[" + "0" + ":" +label + "]\n")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/3/10 17:36
# @Author  : JJkinging
# @File    : test.py
import fileinput

symptom = list(
    map(lambda x: x.strip(),
        fileinput.FileInput(
            r'D:\python_project\AI_doctor\doctor_offline\structured\reviewed\2型糖尿病.csv',
            openhook=fileinput.hook_encoded('utf-8'))))
print(symptom)
def main():
    parser = argparse.ArgumentParser(
        description="""Takes a file containing "word_i marker word_j" tuples
        and builds a cooccurrence count -core space- for each marker. It will
        also build a sparse matrix for detected compositions -peripheral
        space- (e.g. word_i1<-->word_i2 marker word_j)""")
    parser.add_argument('input', help="cooccurrence tuples", default="-",
                        nargs='*')
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument(
        '-o', '--output_dir', required=True,
        help="directory where a cooccurrence count file will be created "
             "for each pattern")
    parser.add_argument('-x', '--compose-op', default='<-->',
                        help='string used to identify a peripheral space '
                             'token')
    parser.add_argument('-c', '--cols', help='filter context words')
    parser.add_argument('-r', '--rows', help='filter pivots')
    parser.add_argument('-m', '--many', type=int, default=MANY,
                        help='number of records needed to start dumping')
    parser.add_argument('-b', '--batch-size', type=int, default=BATCH_SIZE,
                        help='size of batches inserted into the DB')
    parser.add_argument('-e', '--db-engine', help="destination format",
                        choices=['mysql', 'sqlite', 'text'], default='text')
    parser.add_argument('--asynchronic', dest='synchronic',
                        help='continue counting while saving',
                        action='store_false', default=True)
    parser.add_argument('-u', '--mysql_user', help='MySQL username',
                        default=MYSQL_USER)
    parser.add_argument('-p', '--mysql_passwd', help='MySQL password',
                        default=MYSQL_PASS)
    parser.add_argument('-H', '--mysql_hostname', help='MySQL hostname',
                        default=MYSQL_HOST)
    parser.add_argument('-P', '--mysql_port', help='MySQL port', type=int,
                        default=MYSQL_PORT)
    # TODO: add option to customize dense or sparse
    args = parser.parse_args()

    if args.verbose == 0:
        logger.setLevel(logging.ERROR)
    if args.verbose == 1:
        logger.setLevel(logging.INFO)
    if args.verbose == 2:
        logger.setLevel(logging.DEBUG)

    logger.info("Started at {0}".format(
        time.strftime("%d-%m-%Y %H:%M:%S")))

    # make sure the output dir exists
    try:
        os.makedirs(args.output_dir)
    except OSError:
        pass

    if args.cols:
        with open(args.cols) as f_cols:
            cols = [col.rstrip('\n') for col in f_cols]
        col2id = dict((col, i) for i, col in enumerate(cols))
    else:
        cols = None
        col2id = None
    if args.rows:
        with open(args.rows) as f_rows:
            rows = [row.rstrip('\n') for row in f_rows]
        row2id = dict((row, i) for i, row in enumerate(rows))
    else:
        rows = None
        row2id = None

    if args.db_engine == 'mysql':
        per_output_db = args.output_dir + '_peripheral'
        core_output_db = args.output_dir + '_core'
        # argparse stores the MySQL options as mysql_* attributes
        per_dest = MySQLDestination(args.mysql_hostname, args.mysql_port,
                                    args.mysql_user, args.mysql_passwd,
                                    per_output_db, ['cc'], args.batch_size)
        core_dest = MySQLDestination(args.mysql_hostname, args.mysql_port,
                                     args.mysql_user, args.mysql_passwd,
                                     core_output_db, ['cc'], args.batch_size)
    elif args.db_engine == 'sqlite':
        per_output_db = os.path.join(args.output_dir, 'peripheral.db')
        core_output_db = os.path.join(args.output_dir, 'core.db')
        per_dest = SqliteDestination(per_output_db, args.batch_size)
        core_dest = SqliteDestination(core_output_db, args.batch_size)
    elif args.db_engine == 'text':
        per_output_db = os.path.join(args.output_dir, 'peripheral')
        core_output_db = os.path.join(args.output_dir, 'core')
        per_dest = TextDestination(per_output_db)
        core_dest = TextDestination(core_output_db)

    with core_dest, per_dest:
        core = SparseCounter(core_dest, args.many, args.synchronic)
        per = SparseCounter(per_dest, args.many, args.synchronic)
        with Timer() as t_counting:
            try:
                i = 0
                for l in fileinput.input(
                        args.input,
                        openhook=fileinput.hook_encoded("utf-8")):
                    i += 1
                    if i % 100000 == 0:
                        sys.stdout.write('.')
                        if i % 10000000 == 0:
                            sys.stdout.write('\n')
                        sys.stdout.flush()
                    [w1, w2] = l.rstrip('\n').split('\t')
                    if args.compose_op in w1:
                        tg = w1.split(args.compose_op)[1]
                        if ((not row2id or tg in row2id)
                                and (not col2id or w2 in col2id)):
                            per.count(w1, 'c', w2)
                    else:
                        if ((not row2id or w1 in row2id)
                                and (not col2id or w2 in col2id)):
                            core.count(w1, 'c', w2)
            except ValueError:
                logger.error("Error reading line: {0}".format(l))
        logger.info("Counting Finished (t={0:.2f})".format(
            t_counting.interval))
        # wait for any pending saves
        core.join()
        per.join()
        # save residuals
        while len(core) > 0:
            core.save()
        while len(per) > 0:
            per.save()
    logger.info("Finished at {0}".format(
        time.strftime("%d-%m-%Y %H:%M:%S")))
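# A minimal, self-contained sketch of the core/peripheral routing used above,
# with collections.Counter standing in for the script's SparseCounter and
# destination classes (those classes and the COMPOSE_OP default are taken as
# project-specific assumptions, not part of the standard library).
import fileinput
import sys
from collections import Counter

COMPOSE_OP = '<-->'  # mirrors the -x/--compose-op default above

def count_pairs(paths):
    """Route 'word<TAB>context' pairs into core vs. peripheral counters."""
    core, peripheral = Counter(), Counter()
    for line in fileinput.input(paths,
                                openhook=fileinput.hook_encoded("utf-8")):
        try:
            w1, w2 = line.rstrip('\n').split('\t')
        except ValueError:
            continue  # skip malformed lines instead of aborting
        if COMPOSE_OP in w1:
            peripheral[(w1, w2)] += 1
        else:
            core[(w1, w2)] += 1
    return core, peripheral

if __name__ == '__main__':
    core, per = count_pairs(sys.argv[1:] or ('-',))
    print(len(core), 'core pairs,', len(per), 'peripheral pairs')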
'L': {'P': Counter(numer=0, denom=0), 'R': Counter(numer=0, denom=0)}} return confs, softPR data = defaultdict(newConfsMap) # labelset => confs map nSeqs = Counter() # label set => number of sequences having some (predicted or gold) label in the set nTokens = Counter() # label set => number of tokens in the sequences corresponding to this label set allLabels = set() global nIgnoredTokens, nIgnoredSeqs nIgnoredTokens = 0 nIgnoredSeqs = 0 sys.stdin = codecs.getreader("utf-8")(sys.stdin) for seq in loadSequences(fileinput.input(args.conllFiles, openhook=fileinput.hook_encoded("utf-8")), scheme): tkns,golds,preds = zip(*seq) tkns,golds,preds = list(tkns),list(golds),list(preds) labelsThisSeq = set(itm[1] for itm in golds+preds if itm[0]!='O') allLabels.update(labelsThisSeq) selectedLbls = args.l if selectedLbls: lblsets = {tuple(selectedLbls)} # a specific subset of labels elif args.L: lblsets = {()} # all labels else: lblsets = {(lbl,) for lbl in allLabels} | {()} # all labels, plus each label individually for lblset in lblsets: if lblset==('LOC',):
def run(target_dir, inplaceFlag=0): global showSelectListCount, alreadyChanged, showSelectListChanged, labelList retrieveFlag = 0 # 1: start retrieve; 2: end retrieve showTextFields = '' previousLineIndent = '' for root, dirs, files in os.walk(target_dir): for file in files: if file.endswith('.jsp') and (file.lower() in teoconstants.uipgms): print('Processing ' + file) if inplaceFlag == 0: #improve performance f = fileinput.input( root + "\\" + file, inplace=inplaceFlag, openhook=fileinput.hook_encoded('utf-8')) elif inplaceFlag == 1: f = fileinput.input(root + "\\" + file, inplace=inplaceFlag) label = '' staticValues = '' totalValue = '' for i, line in enumerate(f): if (re.search('posui:showSelectList', line, re.IGNORECASE)): showSelectListCount += 1 retrieveFlag = 1 showTextFields += line if retrieveFlag == 1: if line in ('\n', '\r\n'): # ignore blank line continue if (not re.search('/>', line)) and (not re.search( 'posui:showSelectList', line)): previousLineIndent = (re.search( '^(?P<indent>[ \t]*)[a-zA-Z\</\n]?', line, re.IGNORECASE)).group('indent') if re.search( 'label.*\=', line): ##1111111111111111111111111111111 if (re.search("\<.*\>", line)): if inplaceFlag == 1: sys.stdout.write(line) continue else: m = re.search( '(?P<before>^.*)label.*=.*"(?P<label>.*)"', line) label = m.group('label') keyfoundFlag = 0 keyFound = '' subList = [] subList.append(file) subList.append(i + 1) subList.append(label) if inplaceFlag == 1: label = label.decode('utf-8') keyFound = findCorrespondentKey(label) if keyFound != label: keyfoundFlag = 1 subList.append(1) keyFound = re.sub( '_000\d', '_0000', keyFound) line = m.group( 'before' ) + 'label="' + keyFound + '"\n' ## if inplaceFlag == 0: ## print('FOUNDDDDDDD:'+ keyFound) if inplaceFlag == 1: line = line.encode('utf-8') else: subList.append(0) labelMainList.append(subList) elif (re.search( 'staticValues.*\=', line)): ##222222222222222222222222222 if inplaceFlag == 0: ## print('staticValues FOUND ==>'+line) pass if ( re.search("\<.*\>", line) ): ## E.g: totalValue="<%=PosM800500099ConstantsIF.C_LOV_ALL_VALUE%>" if inplaceFlag == 1: sys.stdout.write(line) continue else: m = re.search( '(?P<before>^.*)staticValues.*=.*"(?P<staticValues>.*)"', line) staticValues = m.group('staticValues') staticValues = re.sub( '|$', '', staticValues) newString = '' for value in re.split('\|', staticValues): subList = [] subList.append(file) subList.append(i + 1) subList.append(value) if inplaceFlag == 1: value = value.decode('utf-8') keyFound = findCorrespondentKey(value) if keyFound != value: subList.append(1) keyFound = re.sub( '_000\d', '_0000', keyFound) newString += keyFound + '|' if inplaceFlag == 0: print('FOUNDDDDDDD:' + keyFound) else: subList.append(0) newString += value + '|' if inplaceFlag == 0: print( 'CAN NOT FIND APPROPRIATE KEY for staticValues' ) labelMainList.append(subList) newString = re.sub('\|$', '', newString) line = m.group( 'before' ) + 'staticValues="' + newString + '"\n' if inplaceFlag == 1: line = line.encode('utf-8') elif (re.search('totalValue.*\=', line)): ##3333333333333333333333 ## if inplaceFlag == 0: ## print('totalValue FOUND ==>'+line) if ( re.search("\<.*\>", line) or re.search('"-+"', line) ): ## E.g: totalValue="<%=PosM800500099ConstantsIF.C_LOV_ALL_VALUE%>" ; totalValue="------------" if inplaceFlag == 1: sys.stdout.write(line) continue else: m = re.search( '(?P<before>^.*)totalValue.*=.*"(?P<totalValue>.*)"', line) totalValue = m.group('totalValue') keyfoundFlag = 0 subList = [] subList.append(file) subList.append(i + 1) 
subList.append(totalValue) if inplaceFlag == 1: totalValue = totalValue.decode('utf-8') keyFound = findCorrespondentKey(totalValue) if keyFound != totalValue: keyfoundFlag = 1 subList.append(1) line = m.group( 'before' ) + 'totalValue="' + keyFound + '"\n' if inplaceFlag == 0: print('FOUNDDDDDDD:' + keyFound) elif inplaceFlag == 1: line = line.encode('utf-8') else: subList.append(0) labelMainList.append(subList) showTextFields += line if (re.search('/>', line)) and (retrieveFlag == 1): retrieveFlag = 2 if re.search('isMultiLang', showTextFields) \ or re.search('isLabelMultiLang', showTextFields) \ or re.search('isTotalValueMultiLang', showTextFields) : alreadyChanged += 1 if staticValues and not re.search( 'isMultiLang', showTextFields): line = re.sub( '^', previousLineIndent + 'isMultiLang="true"\n', line) else: if label and totalValue == '' and not re.search( 'isLabelMultiLang', showTextFields): # only label line = re.sub( '^', previousLineIndent + 'isLabelMultiLang="true"\n', line) elif totalValue and label == '' and not re.search( 'isTotalValueMultiLang', showTextFields): # only totalValue line = re.sub( '^', previousLineIndent + 'isTotalValueMultiLang="true"\n', line) elif totalValue and label and not re.search( 'isLabelMultiLang', showTextFields) and not re.search( 'isTotalValueMultiLang', showTextFields): line = re.sub( '^', previousLineIndent + 'isLabelMultiLang="true"\n', line) line = re.sub( '^', previousLineIndent + 'isTotalValueMultiLang="true"\n', line) showTextFields += line if inplaceFlag == 0: sys.stdout.write(showTextFields) if retrieveFlag == 2: retrieveFlag = 0 keyFound = '' showTextFields = '' label = '' staticValues = '' totalValue = '' if inplaceFlag == 1: sys.stdout.write(line) f.close()
(station, CODE, julian, latitude, longitude, epic_date))
# write station header in decimal with PRES = -1, and 5 decimals
# --------------------------------------------------------------
xml_file.write("%3d %4d %9.5f %8.5f %9.5f %s 1e36 1e36 1e36 1e36\n" %
               (station, CODE, julian, latitude, longitude, epic_date))
# substitute .hdr or .HDR in fileName with .asc
# ---------------------------------------------
fileName = re.sub(r'(?i)\.hdr$', '.asc', fileName)
# we don't use the file object's built-in readline() method because it has
# no way of reporting the current line number
# --------------------------------------------------------------------
file = fileinput.input(
    fileName, openhook=fileinput.hook_encoded("ISO-8859-1"))
# iterate over the lines of the opened file "fileName"
# ------------------------------------------------
for line in file:
    # skip header line
    # ----------------
    if file.isfirstline():
        continue
    else:
        # extract data
        # ------------
        (scan, TimeJ, Pres, Depth, T0, T1, C0, C1, v1, v2, v1dt, v2dt,
         Xmiss, FlC, Aqua, Ox0, Ox1, S0, S1, sigmateta0, sigmateta1,
         sndvel0, sndvel1, nbin, flag) = line.split()
s_len = len(s) ngrams = [] for n in xrange(1, min(size + 1, s_len + 1)): for i in xrange(s_len - n + 1): ngrams.append(s[i:i + n]) return ngrams def crfsuite_features(word, size, left_tpl, right_tpl): res = StringIO() for k in xrange(1, len(word)): left, right = word[:k], word[k:] left_size = min(len(left), size) right_size = min(len(right), size) print >> res, '%s\t%s' % (left_tpl[left_size - 1] % tuple( _char_ngrams(left[-size:], size)), right_tpl[right_size - 1] % tuple(_char_ngrams(right[:size], size))) return res.getvalue() if __name__ == '__main__': N = 4 # n-gram size left_tpl = [crfsuite_feature_names(k, True) for k in xrange(1, N + 1)] right_tpl = [crfsuite_feature_names(k, True) for k in xrange(1, N + 1)] for word in fileinput.input(openhook=fileinput.hook_encoded("utf8")): print crfsuite_features(word.strip().lower(), size=N, left_tpl=left_tpl, right_tpl=right_tpl)
# all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. import fileinput import struct subFormat = '4s 2x 8s 1x 2s 2x 8s 1x 2s 2x' fi = fileinput.FileInput(openhook=fileinput.hook_encoded("utf_16_le")) outfile = 0 # output file, sets value so that we wouldn't close the file # the first time round insub = 0 # when 1, processing a multiline subtitle while 1: line = fi.readline() print line if line == '': break # If new file, initialize the subtitle counter and open a new file if (fi.isfirstline()): outfilename = (fi.filename())[0:len(fi.filename()) - 3] + u'srt' # close the previous file if outfile != 0:
    _msg(" done.\nWords before: %d, words after: %d.\n"
         "(words constringed: %d, bytes saved: %d)\n%s\n",
         wcount, len(wlist), c_wcount, c_bsaved, '-' * 60)
    # At the start of a myspell dictionary comes the word count.
    if myspell:
        outfile.write(str(len(wlist)) + '\n')
    outfile.writelines(wlist)


if __name__ == "__main__":
    outfile = sys.stdout
    # Since v2.5+ fileinput accepts an openhook (decoding from the desired
    # encoding). Only relevant for py3 (py2 works with byte strings, so
    # recoding to unicode is unnecessary), but the openhook does not work
    # with stdin.
    if sys.version_info >= (3, ):
        import io
        if not sys.argv[1:]:
            # no arguments, so the data comes from stdin
            sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding=enc)
        outfile = io.TextIOWrapper(sys.stdout.buffer, encoding=enc)
        _fileinput = fileinput.input(openhook=fileinput.hook_encoded(enc))
    else:
        _fileinput = fileinput.input()
    sutrauka(_fileinput, outfile=outfile, myspell=False)
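# The version/argument check above exists because an openhook is only applied
# to files that fileinput opens itself, never to standard input. A minimal
# illustration of that split, with ISO-8859-13 standing in for the script's
# enc setting (an assumption for this sketch):
import fileinput
import io
import sys

ENC = 'ISO-8859-13'

if not sys.argv[1:]:
    # piped data: re-wrap stdin explicitly, since hook_encoded won't touch it
    sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding=ENC)
lines = fileinput.input(openhook=fileinput.hook_encoded(ENC))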
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import fileinput
import sys, glob
import time

# Take every book file whose name starts with "Soci_" and ends in ".txt"
archivos = glob.glob("../Social Sciences books/training/Soci_*.txt")
archivos.sort()

for linea in fileinput.input(archivos,
                             openhook=fileinput.hook_encoded("utf-8")):
    if fileinput.isfirstline():
        # current file name
        book = fileinput.filename()
        Busi_1 = open(book, encoding="utf-8").read()
        Busi1 = nltk.word_tokenize(Busi_1)
        Busi1 = [w.lower() for w in Busi1 if w.isalpha()]
        stop_words = set(stopwords.words('english'))
        filtered_book = [w for w in Busi1 if not w in stop_words]
        single_character = (
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'eg', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
# This script is for extracting the grammar from the rust docs. import fileinput collections = { "gram": [], "keyword": [], "reserved": [], "binop": [], "unop": [] } in_coll = False coll = "" for line in fileinput.input(openhook=fileinput.hook_encoded("utf-8")): if in_coll: if line.startswith("~~~~"): in_coll = False else: if coll in ["keyword", "reserved", "binop", "unop"]: for word in line.split(): if word not in collections[coll]: collections[coll].append(word) else: collections[coll].append(line) else: if line.startswith("~~~~"): for cname in collections: if ("." + cname) in line:
def parse_csv(csv_file, report, timeline): """ Given the location of CSV and TXT files, parse the CSV for notable items Arguments: csv_file: path to csv output to parse Results: report: string text containing the entirety of the text report timeline: string text containing the entirety of the CSV report """ process_output = list() file_output = list() reg_output = list() net_output = list() error_output = list() remote_servers = list() if yara_folder and has_yara: yara_rules = yara_import_rules(yara_folder) else: yara_rules = '' # Use fileinput.input() now to read data line-by-line for original_line in fileinput.input( csv_file, openhook=fileinput.hook_encoded('iso-8859-1')): server = '' if original_line[ 0] != '"': # Ignore lines that begin with Tab. Sysinternals breaks CSV with new processes continue line = original_line.strip(whitespace + '"') field = line.strip().split('","') try: if field[3] in ['Process Create'] and field[5] == 'SUCCESS': cmdline = field[6].split('Command line: ')[1] if not blacklist_scan(cmd_blacklist, field): if generalize_paths: cmdline = generalize_var(cmdline) child_pid = field[6].split('PID: ')[1].split(',')[0] outputtext = '[CreateProcess] %s:%s > "%s"\t[Child PID: %s]' % ( field[1], field[2], cmdline.replace('"', ''), child_pid) timelinetext = '%s,Process,CreateProcess,%s,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], cmdline.replace('"', ''), child_pid) process_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'CreateFile' and field[5] == 'SUCCESS': if not blacklist_scan(file_blacklist, field): path = field[4] if os.path.isdir(path): if generalize_paths: path = generalize_var(path) outputtext = '[CreateFolder] %s:%s > %s' % ( field[1], field[2], path) timelinetext = '%s,File,CreateFolder,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], path) file_output.append(outputtext) timeline.append(timelinetext) else: # This is for actual files. It's a huge try/except, sorry. 
try: md5 = md5_file(path) yara_hits = '' if yara_folder and yara_rules: yara_hits = yara_filescan(path, yara_rules) av_hits = '' if has_virustotal: av_hits = virustotal_scan_file(md5) if generalize_paths: path = generalize_var(path) outputtext = '[CreateFile] %s:%s > %s\t[MD5: %s]%s%s' % ( field[1], field[2], path, md5, yara_hits, av_hits) timelinetext = '%s,File,CreateFile,%s,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], path, md5) file_output.append(outputtext) timeline.append(timelinetext) except (IndexError, IOError): if generalize_paths: path = generalize_var(path) outputtext = '[CreateFile] %s:%s > %s\t[File no longer exists]' % ( field[1], field[2], path) timelinetext = '%s,File,CreateFile,%s,%s,%s,N/A' % ( field[0].split()[0].split('.')[0], field[1], field[2], path) file_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'SetDispositionInformationFile' and field[ 5] == 'SUCCESS': if not blacklist_scan(file_blacklist, field): path = field[4] if generalize_paths: path = generalize_var(path) outputtext = '[DeleteFile] %s:%s > %s' % ( field[1], field[2], field[4]) timelinetext = '%s,File,DeleteFile,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], path) file_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'SetRenameInformationFile': if not blacklist_scan(file_blacklist, field): from_file = field[4] to_file = field[6].split('FileName: ')[1].strip('"') if generalize_paths: from_file = generalize_var(from_file) to_file = generalize_var(to_file) outputtext = '[RenameFile] %s:%s > %s => %s' % ( field[1], field[2], from_file, to_file) timelinetext = '%s,File,RenameFile,%s,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], from_file, to_file) file_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'RegCreateKey' and field[5] == 'SUCCESS': if not blacklist_scan(reg_blacklist, field): outputtext = '[RegCreateKey] %s:%s > %s' % ( field[1], field[2], field[4]) if not outputtext in reg_output: # Ignore multiple CreateKeys. Only log the first. timelinetext = '%s,Registry,RegCreateKey,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], field[4]) reg_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'RegSetValue' and field[5] == 'SUCCESS': if not blacklist_scan(reg_blacklist, field): reg_length = field[6].split('Length:')[1].split( ',')[0].strip(whitespace + '"') if int(reg_length): data_field = field[6].split('Data:')[1].strip( whitespace + '"') if len(data_field.split(' ')) == 16: data_field += ' ...' 
outputtext = '[RegSetValue] %s:%s > %s = %s' % ( field[1], field[2], field[4], data_field) timelinetext = '%s,Registry,RegSetValue,%s,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], field[4], data_field) reg_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'RegDeleteValue': # and field[5] == 'SUCCESS': # SUCCESS is commented out to allows all attempted deletions, whether or not the value exists if not blacklist_scan(reg_blacklist, field): outputtext = '[RegDeleteValue] %s:%s > %s' % ( field[1], field[2], field[4]) timelinetext = '%s,Registry,RegDeleteValue,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], field[4]) reg_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'RegDeleteKey': # and field[5] == 'SUCCESS': # SUCCESS is commented out to allows all attempted deletions, whether or not the value exists if not blacklist_scan(reg_blacklist, field): outputtext = '[RegDeleteKey] %s:%s > %s' % ( field[1], field[2], field[4]) timelinetext = '%s,Registry,RegDeleteKey,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], field[4]) reg_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'UDP Send' and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] # TODO: work on this later, once I can verify it better. #if field[6] == 'Length: 20': # output_line = '[DNS Query] %s:%s > %s' % (field[1], field[2], protocol_replace(server)) #else: outputtext = '[UDP] %s:%s > %s' % ( field[1], field[2], protocol_replace(server)) if not outputtext in net_output: timelinetext = '%s,Network,UDP Send,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], protocol_replace(server)) net_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'UDP Receive' and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] outputtext = '[UDP] %s > %s:%s' % ( protocol_replace(server), field[1], field[2]) if not outputtext in net_output: timelinetext = '%s,Network,UDP Receive,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2]) net_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'TCP Send' and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] outputtext = '[TCP] %s:%s > %s' % ( field[1], field[2], protocol_replace(server)) if not outputtext in net_output: timelinetext = '%s,Network,TCP Send,%s,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2], protocol_replace(server)) net_output.append(outputtext) timeline.append(timelinetext) elif field[3] == 'TCP Receive' and field[5] == 'SUCCESS': if not blacklist_scan(net_blacklist, field): server = field[4].split('-> ')[1] outputtext = '[TCP] %s > %s:%s' % ( protocol_replace(server), field[1], field[2]) if not outputtext in net_output: timelinetext = '%s,Network,TCP Receive,%s,%s' % ( field[0].split()[0].split('.')[0], field[1], field[2]) net_output.append(outputtext) timeline.append(timelinetext) except IndexError: if debug: sys.stderr.write(line) sys.stderr.write(format_exc()) error_output.append(original_line.strip()) # Enumerate unique remote hosts into their own section if server: server = server.split(':')[0] if not server in remote_servers and server != 'localhost': remote_servers.append(server) #} End of file input processing report.append('Processes Created:') report.append('==================') for event in process_output: 
report.append(event) report.append('') report.append('File Activity:') report.append('==================') for event in file_output: report.append(event) report.append('') report.append('Registry Activity:') report.append('==================') for event in reg_output: report.append(event) report.append('') report.append('Network Traffic:') report.append('==================') for event in net_output: report.append(event) report.append('') report.append('Unique Hosts:') report.append('==================') for server in sorted(remote_servers): report.append(protocol_replace(server).strip()) if error_output: report.append('\r\n\r\n\r\n\r\n\r\n\r\nERRORS DETECTED') report.append('The following items could not be parsed correctly:') for error in error_output: report.append(error)
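# For reference, the row handling the parser above relies on (skip rows that
# do not start with a double quote, split on '","' rather than using the csv
# module) can be isolated into a small helper. Field positions in the usage
# comment are assumed to match the Sysinternals CSV layout used above.
import fileinput

def iter_procmon_fields(csv_path):
    """Yield the field list for each quoted ProcMon CSV row."""
    for raw in fileinput.input(
            csv_path, openhook=fileinput.hook_encoded('iso-8859-1')):
        if not raw.startswith('"'):
            continue
        yield raw.strip().strip('"').split('","')

# Example: count successful CreateFile events (operation in field 3,
# result in field 5, as in the parser above).
# n = sum(1 for f in iter_procmon_fields('procmon.csv')
#         if len(f) > 5 and f[3] == 'CreateFile' and f[5] == 'SUCCESS')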
def carregaDorks():
    # Read one dork per line from ISO-8859-1 encoded input
    # (assumes `from fileinput import input, hook_encoded`).
    linhas = input(openhook=hook_encoded("ISO-8859-1"))
    dorks = []
    for linha in linhas:
        dorks.append(removeCRLF(linha))
    return dorks
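# removeCRLF is defined elsewhere in that script; a self-contained equivalent,
# assuming the helper simply strips trailing line endings (the names
# remove_crlf and load_dorks below are illustrative, not from the original):
import fileinput

def remove_crlf(text):
    return text.rstrip('\r\n')

def load_dorks(paths=None):
    """Read one dork per line from ISO-8859-1 encoded files (or stdin)."""
    return [remove_crlf(line)
            for line in fileinput.input(
                paths, openhook=fileinput.hook_encoded("ISO-8859-1"))]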
def check(errors, expected_lines): with FileInput(files=TESTFN, mode='r', openhook=hook_encoded('utf-8', errors=errors)) as fi: lines = list(fi) self.assertEqual(lines, expected_lines)
def check(mode, expected_lines): with FileInput(files=TESTFN, mode=mode, openhook=hook_encoded('utf-7')) as fi: lines = list(fi) self.assertEqual(lines, expected_lines)
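# As a stand-alone illustration of what these tests exercise: hook_encoded
# also accepts an errors argument (Python 3.6+), so decoding problems can be
# replaced or ignored instead of raising. The file name here is arbitrary.
import fileinput

with open('sample.bin', 'wb') as f:
    f.write(b'ok line\n\xff\xfe broken line\n')

with fileinput.FileInput(files='sample.bin',
                         openhook=fileinput.hook_encoded(
                             'utf-8', errors='replace')) as fi:
    for line in fi:
        print(repr(line))  # undecodable bytes appear as U+FFFD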
def main(): parser = argparse.ArgumentParser() parser.add_argument("--output", "-o", help="write output to file instead of stdout") parser.add_argument( "--split", "-s", help= "if writing to file, split into multiple files with this many lines per " "file", type=int, default=0, ) parser.add_argument( "--extra-field", "-e", help="extra fields to include. Provide a field name and a pointer to " "the field. Example: -e verified user.verified", nargs=2, action="append", ) parser.add_argument("--excel", "-x", help="create file compatible with Excel", action="store_true") parser.add_argument( "files", metavar="FILE", nargs="*", help="files to read, if empty, stdin is used", ) args = parser.parse_args() file_count = 1 csv_file = None if args.output: if args.split: csv_file = codecs.open(numbered_filepath(args.output, file_count), "wb", "utf-8") file_count += 1 else: csv_file = codecs.open(args.output, "wb", "utf-8") else: csv_file = sys.stdout sheet = csv.writer(csv_file) extra_headings = [] extra_fields = [] if args.extra_field: for heading, field in args.extra_field: extra_headings.append(heading) extra_fields.append(field) sheet.writerow(get_headings(extra_headings=extra_headings)) files = args.files if len(args.files) > 0 else ("-", ) for count, line in enumerate( fileinput.input(files, openhook=fileinput.hook_encoded("utf-8"))): if args.split and count and count % args.split == 0: csv_file.close() csv_file = codecs.open(numbered_filepath(args.output, file_count), "wb", "utf-8") sheet = csv.writer(csv_file) sheet.writerow(get_headings(extra_headings=extra_headings)) file_count += 1 tweet = json.loads(line) sheet.writerow( get_row(tweet, extra_fields=extra_fields, excel=args.excel))
def labels_nl(self, fname = LABELS_NL): ''' <http://nl.dbpedia.org/resource/Aannemer> <http://www.w3.org/2000/01/rdf-schema#label> "Aannemer"@nl . "lastpart" : { "type" : "string"}, "lastpart_str" : { "type" : "string", "index": "not_analyzed" }, "pref_title" : {"type" : "string"}, "pref_title_str" : {"type" :"string", "index" : "not_analyzed"}, "title" : { "type" : "string"}, "title_str" : {"type" : "string", "index" : "not_analyzed" }, "org_title" : {"type" : "string"}, "org_title_str" : {"type" : "string", "index" : "not_analyzed"}, ''' INPUT_RE_STR = { 'id_nl' : REGEX_LIST['id_nl'], 'label' : REGEX_LIST['label'], } DISAMBIG = ['doorverwijspagina', 'disambiguation'] self.type_op = "update" # Type of operation for ES input_re = {} dbpedia_obj = {} total_found = 0 total_not_found = 0 for regex in INPUT_RE_STR: input_re[regex] = [] for rule in INPUT_RE_STR[regex]: input_re[regex].append(re.compile(rule)) self.commit_buffer = [] self.commit_total = 0 self.commit = 0 for line in fileinput.input(files=[fname], openhook=fileinput.hook_encoded("utf-8")): obj = {} for regex in input_re: for reg in input_re[regex]: key = value = None match_obj = reg.match(line) if not match_obj: continue key = regex value = match_obj.group(1) obj[key] = value ''' for item in DISAMBIG: if value and value.find(item) > -1: continue disambig = 0 if value and value.find('(') > -1: disambig = 1 ''' if obj: res = ES.search(index=ES_INDEX_NAME, q='id_nl:"%s"' % obj['id_nl']) if not res.get('hits').get('total') == 1: self.no_id_found += 1 print(res.get('hits').get('total')) elif res.get('hits').get('total') == 1: obj['id'] = res.get('hits').get('hits')[0].get('_id') obj['lastpart'] = obj['lastpart_str'] = normalize(value).split('(')[0].strip().split(' ')[-1] obj['pref_title'] = obj['pref_title_str'] = value obj['title'] = obj['title_str'] = normalize(value) obj['org_title'] = obj['org_title_str'] = value self.commit_buffer.append(obj) self.commit += 1 if self._check_commit(): break
print(str(directory))
print(str(file))
# convert the .docx to HTML in a temporary file
f = open(file, 'rb')
b = open(temp, 'wb')
document = mammoth.convert_to_html(f)
b.write(document.value.encode('utf8'))
b.close()
f.close()
# build a chained find-and-replace statement, e.g.
# "x = line.replace(find[0], replace[0]).replace(find[1], replace[1])..."
i = 0
c = len(find)
string = "x = line"
while i < c:
    x = ".replace(find[" + str(i) + "], replace[" + str(i) + "])"
    string = string + x
    i = i + 1
# write the output file, applying the replacements line by line
f = open(output, 'wb')
with fileinput.FileInput(temp, inplace=False,
                         openhook=fileinput.hook_encoded(
                             'utf-8', 'surrogateescape')) as file:
    for line in file:
        # execute the find-and-replace statement
        exec(string)
        f.write(x.encode('utf-8'))
f.close()
# remove temp file
os.remove(temp)
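# The exec-built chain above can also be written as a plain loop; a minimal
# sketch under the same assumptions (parallel find/replace lists, a temporary
# HTML file, UTF-8 encoded output):
import fileinput

def apply_replacements(in_path, out_path, find, replace):
    """Copy in_path to out_path, applying each find[i] -> replace[i] pair."""
    with open(out_path, 'wb') as out:
        with fileinput.FileInput(
                in_path,
                openhook=fileinput.hook_encoded('utf-8',
                                                'surrogateescape')) as src:
            for line in src:
                for old, new in zip(find, replace):
                    line = line.replace(old, new)
                out.write(line.encode('utf-8'))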