def write(self):
    """Generate the image's Dockerfile and copy its support files.

    The Dockerfile is written into ``self.image_dir`` (created when
    missing) and replaces any previous one. Its content is every snippet in
    ``self.snippets`` followed by the ``workdir.in`` and ``entrypoint.in``
    snippets, each rendered through ``substitute(self.substitutions)``.
    Afterwards every path in ``self.files`` is copied into
    ``self.image_dir``.

    Raises:
        RuntimeError: if rendering or writing a snippet fails; the
            underlying exception is chained as the cause.
    """
    dockerfile = os.path.join(self.image_dir, 'Dockerfile')
    os.makedirs(self.image_dir, exist_ok=True)

    snippets = list(self.snippets)
    snippets.append(
        Snippet(os.path.join(docker_path, 'snippets/workdir.in')))
    snippets.append(
        Snippet(os.path.join(docker_path, 'snippets/entrypoint.in')))

    # Open the file once in 'w' mode: this truncates a stale Dockerfile and
    # avoids reopening the file in append mode for every single snippet.
    with open(dockerfile, 'w') as f:
        for snippet in snippets:
            try:
                f.write(snippet.substitute(self.substitutions))
            except Exception as e:
                raise RuntimeError(
                    'Error creating Dockerfile for {}: {}'.format(
                        self.name, e)) from e

    for file in self.files:
        shutil.copy(file, self.image_dir)
def mixGPAll(snippets, params):
    """Merge all snippets into one, train it and sample from the result.

    A new ``Snippet("combined")`` receives the ``optSteps``,
    ``learningRate`` and ``lossTolerance`` of the first input snippet and
    the training data of every input snippet. It is then trained, and a
    ``Rejection`` sampler seeded with ``params["x0"]`` (``n=20``) is run to
    completion.

    Returns:
        The ``results`` attribute of the finished sampler.
    """
    logger.mixer("Starting mixGPAll mix method")
    merged = Snippet("combined")

    # The first snippet is assumed to carry the default settings.
    defaults = snippets[0]
    merged.optSteps = defaults.optSteps
    merged.learningRate = defaults.learningRate
    merged.lossTolerance = defaults.lossTolerance

    logger.mixer("Adding training data")
    for source in snippets:
        for sample in source.data:
            merged.addData(sample)

    logger.mixer("Training new snippet")
    merged.train()

    logger.mixer("Sampling new snippet")
    rejection_sampler = Rejection(merged, params["x0"], n=20)
    # The sampler is started and then joined, so the results are only read
    # once it has finished.
    rejection_sampler.start()
    rejection_sampler.join()
    return rejection_sampler.results
def parse_wrap(nd):
    """Parse one section with class="wrap" (it builds a package).

    The section title is read from the text following the first
    ``div/div/div/h2/a`` anchor. Build instructions are collected from the
    ``<pre>`` elements of the direct child ``<div>`` elements.

    Args:
        nd: element exposing the ElementTree ``find``/``findall`` API.

    Returns:
        A ``Section`` built from the title and the collected snippets, or
        ``None`` when the title anchor is missing or no snippet was found.
    """
    # Title has package name and version
    a = nd.find('.//div/div/div/h2/a')
    if a is None or a.tail is None:
        return None

    # Parse the section title: collapse every whitespace run to one space.
    # The pattern is a raw string so that `\s` is not treated as an
    # (invalid) string escape sequence.
    title = re.sub(r'\s+', ' ', a.tail.strip())
    # Patch
    title = title.replace('xml::parser', 'xml-parser')

    # Extract build instructions
    snippets = []
    with_pkg = False
    sbu = sz = None
    for div in nd.findall('./div'):
        # A section may have different (sub-)classes
        if 'class' in div.attrib:
            kl = div.attrib['class']
            if kl == 'package':
                sbu, sz = parse_package(div)
                with_pkg = True
                continue
            # sect2 is used for the 8.4 grub section
            if kl not in ['installation', 'configuration', 'sect2']:
                continue
        # NOTE(review): a div without a class attribute is also scanned
        # here; confirm this nesting against the upstream file.
        for pre in div.findall('.//pre'):
            s = ''
            if pre.text:
                s += pre.text.strip()
            if 'class' in pre.attrib:
                kl = pre.attrib['class']
                # Note: <pre class="root" has only been found in sect1
                # sections
                if kl in ['userinput', 'root']:
                    # This is a code snippet
                    kbds = pre.findall('.//kbd[@class="command"]')
                    s += get_kbds_script(kbds)
                    if s:
                        snippets.append(Snippet('userinput', s))
                elif kl == 'screen':
                    # This is some program output
                    s += get_em_code(pre)
                    if s:
                        snippets.append(Snippet('screen', s))

    # Only create section object if there are instructions
    if snippets:
        return Section(title, snippets, with_pkg=with_pkg, sbu=sbu, sz=sz)
    return None
def on_cell_edited(self, cell, path, new_text):
    """Apply an edited description to the snippet stored at ``path``.

    An empty ``new_text`` is ignored. When the row already holds a snippet
    node, its ``'description'`` is set to ``new_text`` and the row is
    refreshed and selected. Otherwise the row is the
    `Add a new snippet...` item: a new snippet with that description is
    created and, if creation succeeded, stored in the row and made the
    current snippet.
    """
    if new_text == '':
        return

    row = self.model.get_iter(path)
    node = self.model.get_value(row, self.SNIPPET_COLUMN)

    if not node:
        # This is the `Add a new snippet...` item: create a new snippet.
        created = self.new_snippet({'description': new_text})
        if created:
            self.model.set_value(row, self.SNIPPET_COLUMN, created.data)
            self.snippet_changed(row)
            self.snippet = created
            self.selection_changed()
        return

    # Reuse the currently loaded snippet when it wraps this very node.
    target = self.snippet if node == self.snippet.data else Snippet(node)
    target['description'] = new_text
    self.snippet_changed(row)
    self.select_iter(row)
def new_snippet(self, properties=None):
    """Create a snippet for the currently selected language.

    Args:
        properties: passed through unchanged to ``Library().new_snippet``.

    Returns:
        A ``Snippet`` wrapping the newly created library node, or ``None``
        when ``self.language_path`` is not set.
    """
    if self.language_path:
        library = Library()
        created = library.new_snippet(
            self.get_language(self.language_path), properties)
        return Snippet(created)
    return None
def snippet_changed(self, piter=None):
    """Refresh the model row of a snippet after it changed.

    Args:
        piter: row of the changed snippet. When falsy, the row of
            ``self.snippet`` is looked up below ``self.language_path``.

    Returns:
        The row that was refreshed, or the falsy lookup result when no row
        was found.
    """
    if not piter:
        changed = self.snippet
        language_iter = self.model.get_iter(self.language_path)
        piter = self.find_iter(language_iter, changed)
    else:
        changed = Snippet(self.model.get_value(piter, self.SNIPPET_COLUMN))

    if piter:
        label = changed.display()
        # The display string is stored in both the name and sort columns.
        for column in (self.NAME_COLUMN, self.SORT_COLUMN):
            self.model.set_value(piter, column, label)
        self.update_toolbar_buttons()
        self.entry_tab_trigger_update_valid()

    return piter
def snippet_changed(self, piter = None):
    """Update the displayed name of a snippet row.

    With a truthy ``piter`` the snippet is rebuilt from the node stored in
    that row; otherwise ``self.snippet`` is used and its row is searched
    for under ``self.language_path``. When a row is available, its name and
    sort columns are set to the snippet's display string and the toolbar
    buttons and tab-trigger validity are refreshed.

    Returns:
        ``piter`` as given, or the result of the row lookup.
    """
    if piter:
        stored_node = self.model.get_value(piter, self.SNIPPET_COLUMN)
        current = Snippet(stored_node)
    else:
        current = self.snippet
        piter = self.find_iter(
            self.model.get_iter(self.language_path), current)

    if not piter:
        return piter

    shown = current.display()
    self.model.set_value(piter, self.NAME_COLUMN, shown)
    self.model.set_value(piter, self.SORT_COLUMN, shown)
    self.update_toolbar_buttons()
    self.entry_tab_trigger_update_valid()
    return piter
def parse_snippets(self, path: str) -> Type[HeaderFile]:
    """Read the tagged snippet sections of the file at ``path``.

    Lines starting with ``BEGIN_TAG`` open a section and lines starting
    with ``END_TAG`` close it. Inside a section, lines starting with
    ``REQUIRED_TAG`` list comma-separated dependencies on snippets that
    were registered earlier; every other line is part of the snippet body.
    Each closed snippet is registered in ``self.snippets`` under its tag
    and added to the returned header file.

    Returns:
        The ``HeaderFile`` created for ``path``.
        NOTE(review): the annotation says ``Type[HeaderFile]`` although an
        instance is returned - confirm and fix the annotation.

    Raises:
        Exception: on nested or unmatched tags, duplicated keys, unknown
            dependencies, or a section left open at the end of the file.
    """
    header = HeaderFile(path)
    current = None
    body_lines = []

    for line_num, line in enumerate(readlines(path), 1):
        if line.startswith(BEGIN_TAG):
            dprint(f'Process {line}')
            key = line[len(BEGIN_TAG):].strip().strip('`')
            if current is not None:
                raise Exception(
                    f'Unexpected open section ({key}) '
                    f'{path}:{line_num} {line}')
            always_on = key.startswith(ALWAYS_INCLUDE_TAG)
            current = Snippet(key, enabled=always_on)
            continue

        if line.startswith(END_TAG):
            dprint(f'Process {line}')
            key = line[len(END_TAG):].strip().strip('`')
            if current is None or current.tag != key:
                raise Exception(
                    f'Unexpected close section ({key}) '
                    f'{path}:{line_num} {line}')
            if key in self.snippets:
                raise Exception(
                    f'Duplicated key found ({key}) {path}:{line_num}')
            current.body = '\n'.join(body_lines)
            self.snippets[current.tag] = current
            header.add_snippet(current)
            current = None
            body_lines = []
            continue

        # Anything outside an open section is ignored.
        if current is None:
            continue

        if not line.startswith(REQUIRED_TAG):
            body_lines.append(line)
            continue

        for raw_name in line[len(REQUIRED_TAG):].split(','):
            dependency = raw_name.strip()
            if not dependency:
                continue
            dprint(f'{current.tag} depends on `{dependency}`')
            if dependency not in self.snippets:
                raise Exception(
                    f'Unknown dependency {dependency} for snippet '
                    f'{current.tag}({key}) {path}:{line_num}')
            current.add_dependency(self.snippets[dependency])

    if current is not None:
        raise Exception(f'Missed last closing tag {path} ({current.tag})')
    return header
def addSnippet(self, name, paramInfo=None):
    """Register a new ``Snippet`` under ``name``.

    Args:
        name: key of the snippet in ``self.snippets``.
        paramInfo: forwarded to the ``Snippet`` constructor.

    Returns:
        ``DSStatus`` with ``code=0`` when the snippet was added, or
        ``code=-1`` when a snippet with that name already exists.
    """
    if name in self.snippets:
        failure = "Failed to add Snippet with name {0}. Snippet already exists."
        return DSStatus(code=-1, message=failure.format(name))

    self.snippets[name] = Snippet(name, paramInfo)
    return DSStatus(code=0, message="Added Snippet: {0}".format(name))
def __init__(self, name, snippets, files, substitutions):
    """Describe one build image.

    Args:
        name: image name; used for the image directory and the image tag.
        snippets: base names of snippet templates; each is resolved to
            ``<docker_path>/snippets/<name>.in`` and wrapped in a
            ``Snippet``.
        files: stored as ``self.files``.
        substitutions: stored as ``self.substitutions``.
    """
    self.name = name

    self.snippets = []
    for snippet_name in snippets:
        template = os.path.join(docker_path, 'snippets', snippet_name + '.in')
        self.snippets.append(Snippet(template))

    self.files = files
    self.substitutions = substitutions
    self.status = None
    self.image_dir = os.path.join(images_dir, self.name)
    self.image_name = 'gulinux/planetaryimager_build:{}'.format(self.name)
def recite(button):
    """Pick a random snippet and show it in the recitation widgets.

    The chosen snippet is stored in the module-level ``g_verse``; its
    contents and reference are pushed to the ``verse`` and ``reference``
    widgets and the ``user_input`` edit text is cleared.
    """
    # TODO: randomly select unique Snippet
    # TODO: show reference
    # TODO: if practice, show contents
    global g_verse
    chosen = Snippet.random()
    g_verse = chosen
    verse.set_text(chosen.contents)
    reference.set_text(chosen.reference)
    user_input.set_edit_text("")
def from_json_decoded(cls, obj):
    """Return an LFSBook object from a json-decoded object.

    Only the members actually present in ``obj`` are passed to ``cls`` as
    keyword arguments. The ``'snippets'`` member, when present, is first
    converted with ``Snippet.from_json_decoded`` because its values are
    not plain json types.
    """
    kwargs = dict(obj.items())

    # Properties with non-json-serializable values
    if 'snippets' in obj:
        kwargs['snippets'] = list(
            map(Snippet.from_json_decoded, obj['snippets']))

    return cls(**kwargs)
def parse_sect1(nd):
    """Parse one section into a ``Section`` without package information.

    The title is read from the text following the first
    ``div/div/div/h2/a`` anchor and every ``<pre>`` element below ``nd`` is
    scanned for build instructions.

    Args:
        nd: element exposing the ElementTree ``find``/``findall`` API.

    Returns:
        ``Section(title, snippets, with_pkg=False)``, or ``None`` when the
        title anchor is missing or no snippet was found.
    """
    # Title has package name and version
    a = nd.find('.//div/div/div/h2/a')
    if a is None or a.tail is None:
        return None

    # Parse the section title: collapse every whitespace run to one space.
    # The pattern is a raw string so that `\s` is not treated as an
    # (invalid) string escape sequence.
    title = re.sub(r'\s+', ' ', a.tail.strip())

    # Extract build instructions
    snippets = []
    for pre in nd.findall('.//pre'):
        s = ''
        if pre.text:
            s += pre.text.strip()
        if 'class' in pre.attrib:
            kl = pre.attrib['class']
            # Note: <pre class="root" has only been found in sect1 sections
            if kl in ['userinput', 'root']:
                # This is a code snippet
                kbds = pre.findall('.//kbd[@class="command"]')
                s += get_kbds_script(kbds)
                if s:
                    snippets.append(Snippet('userinput', s))
            elif kl == 'screen':
                # This is some program output
                s += get_em_code(pre)
                if s:
                    snippets.append(Snippet('screen', s))

    # Only create section object if there are instructions
    if snippets:
        return Section(title, snippets, with_pkg=False)
    return None
class Proposal(GObject.Object, GtkSource.CompletionProposal):
    """Completion proposal backed by a single snippet."""

    __gtype_name__ = "GeditSnippetsProposal"

    def __init__(self, snippet):
        """Wrap ``snippet`` in a ``Snippet`` for later display."""
        GObject.Object.__init__(self)
        wrapped = Snippet(snippet)
        self._snippet = wrapped

    def snippet(self):
        """Return the ``data`` of the wrapped snippet."""
        wrapped = self._snippet
        return wrapped.data

    # Interface implementation
    def do_get_markup(self):
        """Return the snippet's display string as the proposal markup."""
        wrapped = self._snippet
        return wrapped.display()

    def do_get_info(self):
        """Return the ``'text'`` entry of the snippet data as the info."""
        data = self._snippet.data
        return data['text']
def on_tree_view_selection_changed(self, selection):
    """Switch the current snippet to the one selected in the tree view.

    When a snippet is currently loaded, the focus-out handlers of the tab
    trigger entry, the snippet source view and the drop-targets entry are
    invoked first. The language path is then updated and
    ``self.snippet`` becomes the selected snippet (or ``None`` when the
    selection has no snippet node).
    """
    _parent, _piter, node = self.selected_snippet()

    if self.snippet:
        tab_trigger_entry = self['entry_tab_trigger']
        self.on_entry_tab_trigger_focus_out(tab_trigger_entry, None)

        source_view = self['source_view_snippet']
        self.on_source_view_snippet_focus_out(source_view, None)

        drop_targets_entry = self['combo_drop_targets'].get_child()
        self.on_entry_drop_targets_focus_out(drop_targets_entry, None)

    self.update_language_path()
    self.snippet = Snippet(node) if node else None
    self.selection_changed()
def find_snippet(filename: str = Constants.PYTHON_SNIPPETS_FILE) -> List[Snippet]:
    """Parse a snippets file into a list of ``Snippet`` objects.

    A line containing ``"snippet "`` starts a new snippet: the rest of the
    line is its name and the first word of the name is its prefix. Lines
    starting with ``#`` are skipped. Every other line is added to the
    current snippet's body with its first tab and its newline removed.

    Args:
        filename: path of the snippets file to read.

    Returns:
        All snippets found in the file, in file order, including the one
        that is still open when the file ends.
    """
    snippets_db = []
    started = False
    with open(filename) as snippets_file:
        snippet = Snippet()
        for line in snippets_file:
            if "snippet " in line:
                # A new header closes the snippet collected so far.
                if (snippet.name and snippet.prefix) is not None:
                    snippets_db.append(copy.deepcopy(snippet))
                    snippet.clean()
                snippet.name = line.replace("snippet", "").strip()
                snippet.prefix = snippet.name.split()[0]
                started = True
                continue
            if line.startswith("#"):
                continue
            snippet.body.append(line.replace("\t", "", 1).replace("\n", ""))

    # The last snippet has no following header to flush it, so store it
    # here; without this the final snippet of the file is silently lost.
    if started:
        snippets_db.append(copy.deepcopy(snippet))
    return snippets_db
def __init__(self, snippet_limit: int = 3, take_words: int = 3, results_limit: int = 10, data_dir: str = 'pages') -> None:
    """
    Set up the basic, no-SQL searcher.

    :param snippet_limit: Stored as ``self.snippet_limit``.
    :param take_words: How many words to display in snippets before/after the found word.
    :param results_limit: How many rows to show when displaying top results with snippets.
    :param data_dir: Directory in which the HTML sites are saved relative to the current directory.
    """
    # The stop-word set combines the NLTK "Slovene" list with the extra
    # words below, so the NLTK 'stopwords' corpus has to be available.
    extra_stop_words = {
        "ter", "nov", "novo", "nova", "zato", "še", "zaradi", "a", "ali", "april", "avgust", "b", "bi", "bil",
        "bila", "bile", "bili", "bilo", "biti", "blizu", "bo", "bodo", "bojo", "bolj", "bom", "bomo", "boste",
        "bova", "boš", "brez", "c", "cel", "cela", "celi", "celo", "d", "da", "daleč", "dan", "danes", "datum",
        "december", "deset", "deseta", "deseti", "deseto", "devet", "deveta", "deveti", "deveto", "do", "dober",
        "dobra", "dobri", "dobro", "dokler", "dol", "dolg", "dolga", "dolgi", "dovolj", "drug", "druga", "drugi",
        "drugo", "dva", "dve", "e", "eden", "en", "ena", "ene", "eni", "enkrat", "eno", "etc.", "f", "februar",
        "g", "g.", "ga", "ga.", "gor", "gospa", "gospod", "h", "halo", "i", "idr.", "ii", "iii", "in", "iv", "ix",
        "iz", "j", "januar", "jaz", "je", "ji", "jih", "jim", "jo", "julij", "junij", "jutri", "k", "kadarkoli",
        "kaj", "kajti", "kako", "kakor", "kamor", "kamorkoli", "kar", "karkoli", "katerikoli", "kdaj", "kdo",
        "kdorkoli", "ker", "ki", "kje", "kjer", "kjerkoli", "ko", "koder", "koderkoli", "koga", "komu", "kot",
        "kratek", "kratka", "kratke", "kratki", "l", "lahka", "lahke", "lahki", "lahko", "le", "lep", "lepa",
        "lepe", "lepi", "lepo", "leto", "m", "maj", "majhen", "majhna", "majhni", "malce", "malo", "manj",
        "marec", "me", "med", "medtem", "mene", "mesec", "mi", "midva", "midve", "mnogo", "moj", "moja", "moje",
        "mora", "morajo", "moram", "moramo", "morate", "moraš", "morem", "mu", "n", "na", "nad", "naj", "najina",
        "najino", "najmanj", "naju", "največ", "nam", "narobe", "nas", "nato", "nazaj", "naš", "naša", "naše",
        "ne", "nedavno", "nedelja", "nek", "neka", "nekaj", "nekatere", "nekateri", "nekatero", "nekdo", "neke",
        "nekega", "neki", "nekje", "neko", "nekoga", "nekoč", "ni", "nikamor", "nikdar", "nikjer", "nikoli",
        "nič", "nje", "njega", "njegov", "njegova", "njegovo", "njej", "njemu", "njen", "njena", "njeno", "nji",
        "njih", "njihov", "njihova", "njihovo", "njiju", "njim", "njo", "njun", "njuna", "njuno", "no", "nocoj",
        "november", "npr.", "o", "ob", "oba", "obe", "oboje", "od", "odprt", "odprta", "odprti", "okoli",
        "oktober", "on", "onadva", "one", "oni", "onidve", "osem", "osma", "osmi", "osmo", "oz.", "p", "pa",
        "pet", "peta", "petek", "peti", "peto", "po", "pod", "pogosto", "poleg", "poln", "polna", "polni",
        "polno", "ponavadi", "ponedeljek", "ponovno", "potem", "povsod", "pozdravljen", "pozdravljeni", "prav",
        "prava", "prave", "pravi", "pravo", "prazen", "prazna", "prazno", "prbl.", "precej", "pred", "prej",
        "preko", "pri", "pribl.", "približno", "primer", "pripravljen", "pripravljena", "pripravljeni", "proti",
        "prva", "prvi", "prvo", "r", "ravno", "redko", "res", "reč", "s", "saj", "sam", "sama", "same", "sami",
        "samo", "se", "sebe", "sebi", "sedaj", "sedem", "sedma", "sedmi", "sedmo", "sem", "september", "seveda",
        "si", "sicer", "skoraj", "skozi", "slab", "smo", "so", "sobota", "spet", "sreda", "srednja", "srednji",
        "sta", "ste", "stran", "stvar", "sva", "t", "ta", "tak", "taka", "take", "taki", "tako", "takoj", "tam",
        "te", "tebe", "tebi", "tega", "težak", "težka", "težki", "težko", "ti", "tista", "tiste", "tisti",
        "tisto", "tj.", "tja", "to", "toda", "torek", "tretja", "tretje", "tretji", "tri", "tu", "tudi", "tukaj",
        "tvoj", "tvoja", "tvoje", "u", "v", "vaju", "vam", "vas", "vaš", "vaša", "vaše", "ve", "vedno", "velik",
        "velika", "veliki", "veliko", "vendar", "ves", "več", "vi", "vidva", "vii", "viii", "visok", "visoka",
        "visoke", "visoki", "vsa", "vsaj", "vsak", "vsaka", "vsakdo", "vsake", "vsaki", "vsakomur", "vse",
        "vsega", "vsi", "vso", "včasih", "včeraj", "x", "z", "za", "zadaj", "zadnji", "zakaj", "zaprta",
        "zaprti", "zaprto", "zdaj", "zelo", "zunaj", "č", "če", "često", "četrta", "četrtek", "četrti", "četrto",
        "čez", "čigav", "š", "šest", "šesta", "šesti", "šesto", "štiri", "ž", "že", "svoj", "jesti", "imeti",
        "\u0161e", "iti", "kak", "www", "km", "eur", "pač", "del", "kljub", "šele", "prek", "preko", "znova",
        "morda", "kateri", "katero", "katera", "ampak", "lahek", "lahka", "lahko", "morati", "torej", "gl",
        "xsd", "ipd", "om", "gt", "lt", "d.o.o"
    }

    self.time_needed_to_search = None
    self.take_words = take_words
    self.results_limit = results_limit
    self.data_dir = os.path.join(os.getcwd(), data_dir)
    self.path = data_dir
    self.snippet_limit = snippet_limit
    self.snip = Snippet(self.take_words, self.results_limit)
    self.stop_words_slovene = set(stopwords.words("Slovene")) | extra_stop_words
class BasicSearch:
    """Basic, no-SQL search over a directory tree of HTML files."""

    def __init__(self, snippet_limit: int = 3, take_words: int = 3, results_limit: int = 10, data_dir: str = 'pages') -> None:
        """
        This class is used for basic, no SQL searching.

        :param snippet_limit: How many snippets to print per result row.
        :param take_words: How many words to display in snippets before/after the found word.
        :param results_limit: How many rows to show when displaying top results with snippets.
        :param data_dir: Directory in which the HTML sites are saved relative to the current directory.
        """
        self.time_needed_to_search = None
        self.take_words = take_words
        self.results_limit = results_limit
        self.data_dir = os.path.join(os.getcwd(), data_dir)
        self.path = data_dir
        self.snippet_limit = snippet_limit
        self.snip = Snippet(self.take_words, self.results_limit)
        # NLTK "Slovene" stop words extended with the extra words below; the
        # NLTK 'stopwords' corpus has to be available for this call.
        self.stop_words_slovene = set(stopwords.words("Slovene")).union({
            "ter", "nov", "novo", "nova", "zato", "še", "zaradi", "a", "ali", "april", "avgust", "b", "bi", "bil",
            "bila", "bile", "bili", "bilo", "biti", "blizu", "bo", "bodo", "bojo", "bolj", "bom", "bomo", "boste",
            "bova", "boš", "brez", "c", "cel", "cela", "celi", "celo", "d", "da", "daleč", "dan", "danes", "datum",
            "december", "deset", "deseta", "deseti", "deseto", "devet", "deveta", "deveti", "deveto", "do", "dober",
            "dobra", "dobri", "dobro", "dokler", "dol", "dolg", "dolga", "dolgi", "dovolj", "drug", "druga", "drugi",
            "drugo", "dva", "dve", "e", "eden", "en", "ena", "ene", "eni", "enkrat", "eno", "etc.", "f", "februar",
            "g", "g.", "ga", "ga.", "gor", "gospa", "gospod", "h", "halo", "i", "idr.", "ii", "iii", "in", "iv", "ix",
            "iz", "j", "januar", "jaz", "je", "ji", "jih", "jim", "jo", "julij", "junij", "jutri", "k", "kadarkoli",
            "kaj", "kajti", "kako", "kakor", "kamor", "kamorkoli", "kar", "karkoli", "katerikoli", "kdaj", "kdo",
            "kdorkoli", "ker", "ki", "kje", "kjer", "kjerkoli", "ko", "koder", "koderkoli", "koga", "komu", "kot",
            "kratek", "kratka", "kratke", "kratki", "l", "lahka", "lahke", "lahki", "lahko", "le", "lep", "lepa",
            "lepe", "lepi", "lepo", "leto", "m", "maj", "majhen", "majhna", "majhni", "malce", "malo", "manj",
            "marec", "me", "med", "medtem", "mene", "mesec", "mi", "midva", "midve", "mnogo", "moj", "moja", "moje",
            "mora", "morajo", "moram", "moramo", "morate", "moraš", "morem", "mu", "n", "na", "nad", "naj", "najina",
            "najino", "najmanj", "naju", "največ", "nam", "narobe", "nas", "nato", "nazaj", "naš", "naša", "naše",
            "ne", "nedavno", "nedelja", "nek", "neka", "nekaj", "nekatere", "nekateri", "nekatero", "nekdo", "neke",
            "nekega", "neki", "nekje", "neko", "nekoga", "nekoč", "ni", "nikamor", "nikdar", "nikjer", "nikoli",
            "nič", "nje", "njega", "njegov", "njegova", "njegovo", "njej", "njemu", "njen", "njena", "njeno", "nji",
            "njih", "njihov", "njihova", "njihovo", "njiju", "njim", "njo", "njun", "njuna", "njuno", "no", "nocoj",
            "november", "npr.", "o", "ob", "oba", "obe", "oboje", "od", "odprt", "odprta", "odprti", "okoli",
            "oktober", "on", "onadva", "one", "oni", "onidve", "osem", "osma", "osmi", "osmo", "oz.", "p", "pa",
            "pet", "peta", "petek", "peti", "peto", "po", "pod", "pogosto", "poleg", "poln", "polna", "polni",
            "polno", "ponavadi", "ponedeljek", "ponovno", "potem", "povsod", "pozdravljen", "pozdravljeni", "prav",
            "prava", "prave", "pravi", "pravo", "prazen", "prazna", "prazno", "prbl.", "precej", "pred", "prej",
            "preko", "pri", "pribl.", "približno", "primer", "pripravljen", "pripravljena", "pripravljeni", "proti",
            "prva", "prvi", "prvo", "r", "ravno", "redko", "res", "reč", "s", "saj", "sam", "sama", "same", "sami",
            "samo", "se", "sebe", "sebi", "sedaj", "sedem", "sedma", "sedmi", "sedmo", "sem", "september", "seveda",
            "si", "sicer", "skoraj", "skozi", "slab", "smo", "so", "sobota", "spet", "sreda", "srednja", "srednji",
            "sta", "ste", "stran", "stvar", "sva", "t", "ta", "tak", "taka", "take", "taki", "tako", "takoj", "tam",
            "te", "tebe", "tebi", "tega", "težak", "težka", "težki", "težko", "ti", "tista", "tiste", "tisti",
            "tisto", "tj.", "tja", "to", "toda", "torek", "tretja", "tretje", "tretji", "tri", "tu", "tudi", "tukaj",
            "tvoj", "tvoja", "tvoje", "u", "v", "vaju", "vam", "vas", "vaš", "vaša", "vaše", "ve", "vedno", "velik",
            "velika", "veliki", "veliko", "vendar", "ves", "več", "vi", "vidva", "vii", "viii", "visok", "visoka",
            "visoke", "visoki", "vsa", "vsaj", "vsak", "vsaka", "vsakdo", "vsake", "vsaki", "vsakomur", "vse",
            "vsega", "vsi", "vso", "včasih", "včeraj", "x", "z", "za", "zadaj", "zadnji", "zakaj", "zaprta",
            "zaprti", "zaprto", "zdaj", "zelo", "zunaj", "č", "če", "često", "četrta", "četrtek", "četrti", "četrto",
            "čez", "čigav", "š", "šest", "šesta", "šesti", "šesto", "štiri", "ž", "že", "svoj", "jesti", "imeti",
            "\u0161e", "iti", "kak", "www", "km", "eur", "pač", "del", "kljub", "šele", "prek", "preko", "znova",
            "morda", "kateri", "katero", "katera", "ampak", "lahek", "lahka", "lahko", "morati", "torej", "gl",
            "xsd", "ipd", "om", "gt", "lt", "d.o.o"
        })

    def print_results(self, query: list, results: list) -> None:
        """
        Prints the results of the query to the std output

        :param query: The user search query in list format.
        :param results: A list of lists containing the results in format [frequencies, document, snippets].
            The list should be ordered in descending order.
        """
        print(f'Results for query: "{" ".join(query)}"')
        print('{} results found in {:.0f}s'.format(len(results), self.time_needed_to_search))
        print("{:<12} {:<40} {}".format('Frequencies', 'Document', 'Snippets'))
        print("{} {} {}".format('-' * 12, '-' * 40, '-' * 80))
        # Only the first `results_limit` rows and, per row, the first
        # `snippet_limit` snippets are printed.
        for frequency, document, snippets in results[:self.results_limit]:
            print("{:<12} {:<40} {}".format(
                frequency, document,
                '... ' + ' ... '.join(snippets[:self.snippet_limit]) + ' ...'))

    def get_files(self) -> list:
        """
        Gets a list of paths to all HTML files which are contained in the self.path folder.

        :return: List of all HTML files.
        """
        files_path = [
            os.path.join(dp, f)
            for dp, dn, filenames in os.walk(self.path)
            for f in filenames
            if os.path.splitext(f)[1].lower() == '.html'
        ]
        return files_path

    def get_document(self, page_name: str) -> object:
        """
        Gets and opens the provided document.

        :param page_name: The HTML document that should be opened for reading.
        :return: The opened document; the caller is responsible for closing it.
        """
        # The sub-directory name is the first three dot-separated parts of
        # the page name.
        map_name = '.'.join(page_name.split('.')[:3])
        return open(os.path.join(self.data_dir, "{}/{}".format(map_name, page_name)), 'r', encoding='utf-8')

    def parse_file(self, file: str, search_words: list) -> tuple:
        """
        Searches the HTML document for the words in the query and returns the frequency and snippets.

        :param file: The HTML file to search.
        :param search_words: The user search query.
        :return: A tuple with frequencies and result snippets.
        """
        # Read the file inside a `with` block so the handle is always
        # closed, even when parsing fails.
        with codecs.open(file, 'r', 'utf-8') as f:
            soup = BeautifulSoup(f.read(), features="html.parser")
        for script in soup(["script", "style"]):
            script.decompose()

        # tokenize and clean file
        strips = list(soup.stripped_strings)
        document = ' '.join(strips)
        tokens = word_tokenize(document, language='Slovene', preserve_line=False)

        indexes = list()
        for i, raw_token in enumerate(tokens):
            token = raw_token.lower().replace("'", "").replace("'", '').replace(
                '`', '').replace('·', '')
            # skip if token doesn't contain letters
            if not re.search('[a-žA-ž]', token):
                continue
            # skip tokens of length 1, if they're not a word or number
            if len(token) == 1 and not re.match("^[A-Ža-ž0-9]*$", token):
                continue
            # if token ends with a special character, remove it
            if len(token) >= 2 and not re.match("^[A-Ža-ž0-9]*$", token[-1]):
                token = token[:-1]
            # if token starts with a special character, remove it
            if len(token) >= 2 and not re.match("^[A-Ža-ž0-9]*$", token[0]):
                token = token[1:]
            # stop-words never count as hits; otherwise remember the index
            # of every token that is part of the query
            if token not in self.stop_words_slovene:
                if token in search_words:
                    indexes.append(i)

        try:
            snippet = self.snip.get_snip(tokens, indexes)
        except AssertionError:
            # An AssertionError from get_snip yields no snippets.
            snippet = []
        return len(indexes), snippet

    def search(self, words: list) -> None:
        """
        Performs the search in all files for the user query.

        :param words: The user search query.
        """
        results = []
        # start the timer and do the search, after that calculate the time needed for search
        start_time = time.time()
        files = self.get_files()
        for file in files:
            f, s = self.parse_file(file, words)
            if f > 0:
                results.append([f, ntpath.split(file)[1], s])
        end_time = time.time()
        self.time_needed_to_search = end_time - start_time
        results.sort(key=lambda x: x[0], reverse=True)
        self.print_results(words, results)
def musicThumbnailing(x, Fs, shortTermSize=1.0, shortTermStep=0.5, thumbnailSize=10.0, Limit1=0, Limit2=1):
    """Find candidate repeated segments in a signal via self-similarity.

    Short-term features are extracted from the mono signal, their
    self-similarity matrix is filtered with an M x M identity kernel
    (M = thumbnailSize / shortTermStep, rounded), and high-valued cells are
    repeatedly expanded along the diagonal into candidate matches.

    Args:
        x: input signal; converted with ``audioBasicIO.stereo2mono``.
        Fs: sampling frequency, used to scale window and step to samples.
        shortTermSize: short-term window size (presumably seconds - TODO
            confirm against ``aF.stFeatureExtraction``).
        shortTermStep: short-term window step, same unit as the size.
        thumbnailSize: target segment length, same unit as the step.
        Limit1: fraction of the matrix below which rows/columns are masked.
        Limit2: fraction of the matrix from which rows/columns are masked.

    Returns:
        ``(matches, Sbak)``: the list of ``Snippet`` objects created for
        sufficiently long segments, and a copy of the filtered, masked
        similarity matrix taken before the match search modified it.
    """
    # self-similarity matrix
    x = audioBasicIO.stereo2mono(x)
    stFeatures = aF.stFeatureExtraction(x, Fs, Fs * shortTermSize,
                                        Fs * shortTermStep)
    S = aS.selfSimilarityMatrix(stFeatures)
    # moving filter:
    M = int(round(thumbnailSize / shortTermStep))
    B = np.eye(M, M)
    S = scipy.signal.convolve2d(S, B, 'valid')
    MIN = np.min(S)
    # post-processing (remove main diagonal elements): cells closer than
    # 5.0 / shortTermStep to the diagonal, and the whole lower triangle,
    # are reset to the minimum so they can never be selected.
    for i in range(S.shape[0]):
        for j in range(S.shape[1]):
            if abs(i - j) < 5.0 / shortTermStep or i > j:
                S[i, j] = MIN
    # find max position: mask everything outside [Limit1, Limit2].
    # NOTE(review): the column limits are also derived from S.shape[0];
    # this only matches the row limits when S is square - confirm.
    S[0:int(Limit1 * S.shape[0]), :] = MIN
    S[:, 0:int(Limit1 * S.shape[0])] = MIN
    S[int(Limit2 * S.shape[0])::, :] = MIN
    S[:, int(Limit2 * S.shape[0])::] = MIN
    matches = []
    maxMax = maxVal = np.max(S)
    i1 = i2 = j1 = j2 = 0
    # Keep an unmodified copy: the loop below overwrites cells of S.
    Sbak = np.copy(S)
    # Continue while the current maximum exceeds two thirds of the global
    # maximum (and that threshold is itself above MIN).
    while maxVal > maxMax / 3 * 2 > MIN:  # currently arbitrary cutoff
        [I, J] = np.unravel_index(S.argmax(), S.shape)
        # expand: grow the segment along the diagonal, one step at a time,
        # in whichever direction has the larger neighbouring value.
        i1 = I
        i2 = I
        j1 = J
        j2 = J
        while i2 - i1 < M:
            if i1 <= 0 or j1 <= 0 or i2 >= S.shape[0] - 2 or j2 >= S.shape[
                    1] - 2:
                break
            if S[i1 - 1, j1 - 1] > S[i2 + 1, j2 + 1]:
                i1 -= 1
                j1 -= 1
            else:
                i2 += 1
                j2 += 1
            # NOTE(review): assumed to run on every expansion step so the
            # visited end points are not picked again - confirm nesting.
            S[i1, j1] = S[i2, j2] = MIN
        # only add to potential matches if we have enough overlap or new record
        if i2 - i1 >= M:
            matches.append(
                Snippet(maxVal, shortTermStep * i1, shortTermStep * i2,
                        shortTermStep * j1, shortTermStep * j2))
        # Clear the seed cell so the next iteration finds a new maximum.
        S[I, J] = MIN
        maxVal = np.max(S)
    return (matches, Sbak)
def fetch_snippets(num_snippets, start_time, end_time, extra_tags):
    """
    Fetches snippets from StackOverflow by looking at 'python' questions
    in the time-period specified, and retrieving `<pre><code>` code blocks
    from these questions (and their answers, where applicable)

    Note that currently the number of snippets returned is not the same as
    the number requested: it depends on the number of retrieved questions
    with a code block, as well as the number with an accepted answer with
    a code block

    Args:
        num_snippets: number of questions to retrieve before stopping.
        start_time: passed as ``from_time`` to the question query.
        end_time: passed as ``to_time`` to the question query.
        extra_tags: passed as ``tags`` to the question query.

    Returns:
        The list of ``Snippet`` objects built from every accepted code
        block of the retrieved questions and their accepted answers.
    """
    filters = load_filter_file()
    snippets = []
    answer_ids = []
    questions_retrieved = 0
    page_num = 1

    while questions_retrieved < num_snippets:
        questions, has_more = fetch_recent_questions(
            num_questions=(num_snippets - questions_retrieved),
            from_time=start_time,
            to_time=end_time,
            tags=extra_tags,
            page_num=page_num,
            filter_name=filters.Questions)
        current_time = datetime.utcnow()

        for q in questions:
            snippets += [
                Snippet(snippet_id=q['question_id'],
                        code=block,
                        url=q['link'],
                        author='stack-overflow',
                        retrieved_at=current_time,
                        additional_url=None)
                for block in get_snippets(q['body'])
                if check_source_and_warn(block, q['link'])
            ]
            answer_ids.append(q.get('accepted_answer_id', None))

        questions_retrieved += len(questions)
        page_num += 1
        if not has_more:
            warn('No more questions to fetch: Terminating')
            break

    # Filter out `None` accepted_answer_id's
    answer_ids = [str(ans_id) for ans_id in answer_ids if ans_id]
    log('Retrieving {} accepted answers for analysis...'.format(
        len(answer_ids)))

    # Answers are fetched in batches of ANSWER_BATCH_SIZE ids. They reuse
    # the timestamp taken for the last page of questions.
    for i in range(0, len(answer_ids), ANSWER_BATCH_SIZE):
        batch = answer_ids[i:i + ANSWER_BATCH_SIZE]
        answers = fetch_answers(batch, filters.Answers)
        for a in answers:
            snippets += [
                Snippet(snippet_id=a['answer_id'],
                        code=block,
                        url=a['link'],
                        author='stack-overflow',
                        retrieved_at=current_time,
                        additional_url=None)
                for block in get_snippets(a['body'])
                # Check against the answer's own link; the question loop
                # variable would point at an unrelated, stale question.
                if check_source_and_warn(block, a['link'])
            ]

    success('Retrieved {} snippets'.format(len(snippets)))
    return snippets
def apply_snippet(self, snippet, start = None, end = None):
    """Insert ``snippet`` into the view's buffer and focus its first placeholder.

    Args:
        snippet: snippet data to insert; it must be ``valid``.
        start: start of the range to replace; defaults to the insert mark.
        end: end of the range to replace; defaults to the selection bound.

    Returns:
        ``False`` when the snippet is not valid, ``True`` after insertion.
    """
    if not snippet.valid:
        return False

    buf = self.view.get_buffer()
    s = Snippet(snippet)

    if not start:
        start = buf.get_iter_at_mark(buf.get_insert())

    if not end:
        end = buf.get_iter_at_mark(buf.get_selection_bound())

    if start.equal(end) and self.uses_current_word(s):
        # There is no tab trigger and no selection and the snippet uses
        # the current word. Set start and end to the word boundary so that
        # it will be removed
        start, end = buffer_word_boundary(buf)
    elif start.equal(end) and self.uses_current_line(s):
        # There is no tab trigger and no selection and the snippet uses
        # the current line. Set start and end to the line boundary so that
        # it will be removed
        start, end = buffer_line_boundary(buf)

    # Set environmental variables
    self.update_environment()

    # You know, we could be in an end placeholder
    current, _ = self.next_placeholder()
    if current and current.__class__ == PlaceholderEnd:
        self.goto_placeholder(current, None)

    buf.begin_user_action()

    # Remove the tag, selection or current word
    buf.delete(start, end)

    # Insert the snippet
    if len(self.active_snippets) == 0:
        self.first_snippet_inserted()

    sn = s.insert_into(self, start)
    self.active_snippets.append(sn)

    # Put cursor at first tab placeholder. A real list is required here:
    # on Python 3 `filter` returns an iterator, which supports neither
    # `len()` nor indexing.
    keys = [key for key in sn.placeholders.keys() if key > 0]

    if not keys:
        if 0 in sn.placeholders:
            self.goto_placeholder(self.active_placeholder, sn.placeholders[0])
        else:
            buf.place_cursor(sn.begin_iter())
    else:
        self.goto_placeholder(self.active_placeholder, sn.placeholders[keys[0]])

    if sn in self.active_snippets:
        # Check if we can get end_iter in view without moving the
        # current cursor position out of view
        cur = buf.get_iter_at_mark(buf.get_insert())
        last = sn.end_iter()

        curloc = self.view.get_iter_location(cur)
        lastloc = self.view.get_iter_location(last)

        if (lastloc.y + lastloc.height) - curloc.y <= \
                self.view.get_visible_rect().height:
            self.view.scroll_mark_onscreen(sn.end_mark)

    buf.end_user_action()
    self.view.grab_focus()

    return True
def apply_snippet(self, snippet, start=None, end=None, environ=None):
    """Insert ``snippet`` into the view's buffer and focus its first placeholder.

    Args:
        snippet: snippet data to insert; it must be ``valid``.
        start: start of the range to replace; defaults to the insert mark.
        end: end of the range to replace; defaults to the selection bound.
        environ: optional mapping with ``'utf8'`` and ``'noenc'``
            sub-mappings whose entries override those of
            ``self.get_environment()``. Omitting it or passing an empty
            mapping leaves the environment unchanged.

    Returns:
        ``False`` when the snippet is not valid, ``True`` after insertion.
    """
    if not snippet.valid:
        return False

    # Set environmental variables
    env = self.get_environment()

    if environ:
        for k in environ['utf8']:
            env['utf8'][k] = environ['utf8'][k]

        for k in environ['noenc']:
            env['noenc'][k] = environ['noenc'][k]

    buf = self.view.get_buffer()
    s = Snippet(snippet, env)

    if not start:
        start = buf.get_iter_at_mark(buf.get_insert())

    if not end:
        end = buf.get_iter_at_mark(buf.get_selection_bound())

    if start.equal(end) and self.uses_current_word(s):
        # There is no tab trigger and no selection and the snippet uses
        # the current word. Set start and end to the word boundary so that
        # it will be removed
        start, end = buffer_word_boundary(buf)
    elif start.equal(end) and self.uses_current_line(s):
        # There is no tab trigger and no selection and the snippet uses
        # the current line. Set start and end to the line boundary so that
        # it will be removed
        start, end = buffer_line_boundary(buf)

    # You know, we could be in an end placeholder
    current, _ = self.next_placeholder()
    if current and current.__class__ == PlaceholderEnd:
        self.goto_placeholder(current, None)

    buf.begin_user_action()

    # Remove the tag, selection or current word
    buf.delete(start, end)

    # Insert the snippet
    if len(self.active_snippets) == 0:
        self.first_snippet_inserted()

    sn = s.insert_into(self, start)
    self.active_snippets.append(sn)

    # Put cursor at first tab placeholder. A real list is required here:
    # on Python 3 `filter` returns an iterator, which supports neither
    # `len()` nor indexing.
    keys = [key for key in sn.placeholders.keys() if key > 0]

    if not keys:
        if 0 in sn.placeholders:
            self.goto_placeholder(self.active_placeholder, sn.placeholders[0])
        else:
            buf.place_cursor(sn.begin_iter())
    else:
        self.goto_placeholder(self.active_placeholder, sn.placeholders[keys[0]])

    if sn in self.active_snippets:
        # Check if we can get end_iter in view without moving the
        # current cursor position out of view
        cur = buf.get_iter_at_mark(buf.get_insert())
        last = sn.end_iter()

        curloc = self.view.get_iter_location(cur)
        lastloc = self.view.get_iter_location(last)

        if (lastloc.y + lastloc.height) - curloc.y <= \
                self.view.get_visible_rect().height:
            self.view.scroll_mark_onscreen(sn.end_mark)

    buf.end_user_action()
    self.view.grab_focus()

    return True
class SQLiteSearch:
    """Search an SQLite inverted index and print ranked results with snippets.

    The index is read from the ``posting`` table (columns ``word``,
    ``documentName``, ``frequency`` and ``indexes``) of the database named by
    ``db_name``; the matching HTML pages are read from ``data_dir``.
    """

    # Shared by the literal and the parameterized query builders; ``{}`` is
    # replaced with the ``where`` condition.
    _QUERY_TEMPLATE = (
        "select documentName as file, SUM(frequency) 'freq', GROUP_CONCAT(indexes) as idx "
        "from posting "
        "where {} "
        "group by documentName "
        "order by freq desc "
    )

    def __init__(self, snippet_limit=3, take_words=3, results_limit=10,
                 db_name='inverted-index.db', data_dir='pages'):
        """Store the search settings, build the stop-word set and open the DB.

        Args:
            snippet_limit: Maximum number of snippets printed per document.
            take_words: Passed to ``Snippet`` as its first argument.
            results_limit: Maximum number of documents processed and printed;
                also passed to ``Snippet`` as its second argument.
            db_name: Path of the SQLite database file.
            data_dir: Directory (relative to the current working directory)
                that holds the indexed pages.
        """
        self.time_needed_to_search = None
        self.db_name = db_name
        self.take_words = take_words
        self.snippet_limit = snippet_limit
        self.results_limit = results_limit
        self.data_dir = os.path.join(os.getcwd(), data_dir)
        self.all_results = None
        self.snip = Snippet(self.take_words, self.results_limit)
        self.stop_words_slovene = set(
            nltk.corpus.stopwords.words("Slovene")
        ).union({
            "ter", "nov", "novo", "nova", "zato", "še", "zaradi", "a", "ali",
            "april", "avgust", "b", "bi", "bil", "bila", "bile", "bili",
            "bilo", "biti", "blizu", "bo", "bodo", "bojo", "bolj", "bom",
            "bomo", "boste", "bova", "boš", "brez", "c", "cel", "cela",
            "celi", "celo", "d", "da", "daleč", "dan", "danes", "datum",
            "december", "deset", "deseta", "deseti", "deseto", "devet",
            "deveta", "deveti", "deveto", "do", "dober", "dobra", "dobri",
            "dobro", "dokler", "dol", "dolg", "dolga", "dolgi", "dovolj",
            "drug", "druga", "drugi", "drugo", "dva", "dve", "e", "eden",
            "en", "ena", "ene", "eni", "enkrat", "eno", "etc.", "f",
            "februar", "g", "g.", "ga", "ga.", "gor", "gospa", "gospod", "h",
            "halo", "i", "idr.", "ii", "iii", "in", "iv", "ix", "iz", "j",
            "januar", "jaz", "je", "ji", "jih", "jim", "jo", "julij",
            "junij", "jutri", "k", "kadarkoli", "kaj", "kajti", "kako",
            "kakor", "kamor", "kamorkoli", "kar", "karkoli", "katerikoli",
            "kdaj", "kdo", "kdorkoli", "ker", "ki", "kje", "kjer",
            "kjerkoli", "ko", "koder", "koderkoli", "koga", "komu", "kot",
            "kratek", "kratka", "kratke", "kratki", "l", "lahka", "lahke",
            "lahki", "lahko", "le", "lep", "lepa", "lepe", "lepi", "lepo",
            "leto", "m", "maj", "majhen", "majhna", "majhni", "malce",
            "malo", "manj", "marec", "me", "med", "medtem", "mene", "mesec",
            "mi", "midva", "midve", "mnogo", "moj", "moja", "moje", "mora",
            "morajo", "moram", "moramo", "morate", "moraš", "morem", "mu",
            "n", "na", "nad", "naj", "najina", "najino", "najmanj", "naju",
            "največ", "nam", "narobe", "nas", "nato", "nazaj", "naš", "naša",
            "naše", "ne", "nedavno", "nedelja", "nek", "neka", "nekaj",
            "nekatere", "nekateri", "nekatero", "nekdo", "neke", "nekega",
            "neki", "nekje", "neko", "nekoga", "nekoč", "ni", "nikamor",
            "nikdar", "nikjer", "nikoli", "nič", "nje", "njega", "njegov",
            "njegova", "njegovo", "njej", "njemu", "njen", "njena", "njeno",
            "nji", "njih", "njihov", "njihova", "njihovo", "njiju", "njim",
            "njo", "njun", "njuna", "njuno", "no", "nocoj", "november",
            "npr.", "o", "ob", "oba", "obe", "oboje", "od", "odprt",
            "odprta", "odprti", "okoli", "oktober", "on", "onadva", "one",
            "oni", "onidve", "osem", "osma", "osmi", "osmo", "oz.", "p",
            "pa", "pet", "peta", "petek", "peti", "peto", "po", "pod",
            "pogosto", "poleg", "poln", "polna", "polni", "polno",
            "ponavadi", "ponedeljek", "ponovno", "potem", "povsod",
            "pozdravljen", "pozdravljeni", "prav", "prava", "prave", "pravi",
            "pravo", "prazen", "prazna", "prazno", "prbl.", "precej", "pred",
            "prej", "preko", "pri", "pribl.", "približno", "primer",
            "pripravljen", "pripravljena", "pripravljeni", "proti", "prva",
            "prvi", "prvo", "r", "ravno", "redko", "res", "reč", "s", "saj",
            "sam", "sama", "same", "sami", "samo", "se", "sebe", "sebi",
            "sedaj", "sedem", "sedma", "sedmi", "sedmo", "sem", "september",
            "seveda", "si", "sicer", "skoraj", "skozi", "slab", "smo", "so",
            "sobota", "spet", "sreda", "srednja", "srednji", "sta", "ste",
            "stran", "stvar", "sva", "t", "ta", "tak", "taka", "take",
            "taki", "tako", "takoj", "tam", "te", "tebe", "tebi", "tega",
            "težak", "težka", "težki", "težko", "ti", "tista", "tiste",
            "tisti", "tisto", "tj.", "tja", "to", "toda", "torek", "tretja",
            "tretje", "tretji", "tri", "tu", "tudi", "tukaj", "tvoj",
            "tvoja", "tvoje", "u", "v", "vaju", "vam", "vas", "vaš", "vaša",
            "vaše", "ve", "vedno", "velik", "velika", "veliki", "veliko",
            "vendar", "ves", "več", "vi", "vidva", "vii", "viii", "visok",
            "visoka", "visoke", "visoki", "vsa", "vsaj", "vsak", "vsaka",
            "vsakdo", "vsake", "vsaki", "vsakomur", "vse", "vsega", "vsi",
            "vso", "včasih", "včeraj", "x", "z", "za", "zadaj", "zadnji",
            "zakaj", "zaprta", "zaprti", "zaprto", "zdaj", "zelo", "zunaj",
            "č", "če", "često", "četrta", "četrtek", "četrti", "četrto",
            "čez", "čigav", "š", "šest", "šesta", "šesti", "šesto", "štiri",
            "ž", "že", "svoj", "jesti", "imeti", "\u0161e", "iti", "kak",
            "www", "km", "eur", "pač", "del", "kljub", "šele", "prek",
            "preko", "znova", "morda", "kateri", "katero", "katera", "ampak",
            "lahek", "lahka", "lahko", "morati", "torej", "gl", "xsd", "ipd",
            "om", "gt", "lt", "d.o.o"
        })
        self.conn = sqlite3.connect(self.db_name)

    def build_query(self, words):
        """Return the complete SQL text that looks up ``words``.

        Args:
            words: A single word or a list of words.

        Returns:
            The query as one string with the words embedded as SQL string
            literals. Single quotes inside a word are doubled so the word
            cannot terminate its literal.
        """
        if not isinstance(words, list):
            words = [words]

        whr = ' or '.join(
            "word = '{}'".format(str(w).replace("'", "''")) for w in words)
        return self._QUERY_TEMPLATE.format(whr)

    @staticmethod
    def process_results(results):
        """Convert raw DB rows into ``[document, frequency, [indexes]]`` lists.

        Always returns a list (empty when ``results`` is empty) so callers
        can safely take its length or iterate over it.
        """
        return [
            [row[0], row[1], [int(i) for i in row[2].split(',')]]
            for row in results
        ]

    def print_results(self, query, frequencies, pages, snippets):
        """Print the result table for ``query``, at most ``results_limit`` rows."""
        print(f'Results for query: "{" ".join(query)}"')
        print('{} results found in {:.0f}ms'.format(
            self.all_results, self.time_needed_to_search))
        print("{:<12} {:<40} {}".format('Frequencies', 'Document', 'Snippets'))
        print("{} {} {}".format('-' * 12, '-' * 40, '-' * 80))

        for i in range(min(self.results_limit, len(pages))):
            print("{:<12} {:<40} {}".format(
                frequencies[i], pages[i],
                '... ' + ' ... '.join(snippets[i][:self.snippet_limit]) + ' ...'))

    def search_db(self, words):
        """Query the index for ``words`` and return the processed rows.

        The words are bound as SQL parameters rather than formatted into the
        statement. An empty word list yields an empty result instead of a
        malformed query.
        """
        if not isinstance(words, list):
            words = [words]
        if not words:
            return []

        where = ' or '.join('word = ?' for _ in words)
        cursor = self.conn.cursor()
        try:
            cursor.execute(self._QUERY_TEMPLATE.format(where),
                           [str(w) for w in words])
            rows = cursor.fetchall()
        finally:
            cursor.close()
        return self.process_results(rows)

    def get_document(self, page_name):
        """Open the stored page ``page_name`` for reading as UTF-8 text.

        The page is looked up in a sub-directory named after the first three
        dot-separated parts of ``page_name``. The caller is responsible for
        closing the returned file object.
        """
        map_name = '.'.join(page_name.split('.')[:3])
        return open(os.path.join(self.data_dir, "{}/{}".format(map_name, page_name)),
                    'r', encoding='utf-8')

    def search(self, words):
        """Run a search for ``words``, build snippets and print the results.

        Sets ``self.all_results`` to the number of matching documents and
        ``self.time_needed_to_search`` to the DB lookup time in milliseconds.
        """
        if not isinstance(words, list):
            words = [words]

        # Time only the database lookup.
        start_time = time.time()
        results = self.search_db(words)
        self.all_results = len(results)
        end_time = time.time()
        self.time_needed_to_search = (end_time - start_time) * 1000

        pages, frequencies, snippets = [], [], []
        # Only the documents that will actually be printed are parsed.
        for page_file, frequency, indexes in results[:self.results_limit]:
            pages.append(page_file)
            frequencies.append(frequency)

            # BeautifulSoup reads the whole file while constructing the tree,
            # so the handle can be closed right afterwards.
            with self.get_document(page_file) as document:
                soup = BeautifulSoup(document, features="html.parser")
            for element in soup(["script", "style"]):
                element.decompose()

            text = ' '.join(soup.stripped_strings)
            # Snippets are built from the raw token stream; the stop-word
            # filtered token list was computed here previously but never used.
            tokens = nltk.word_tokenize(text, language='Slovene', preserve_line=False)
            snippets.append(self.snip.get_snip(tokens, indexes))

        self.print_results(words, frequencies, pages, snippets)
import urwid
from difflib import SequenceMatcher
from snippet import Snippet

g_verse = None  # placeholder for currently selected verse
g_verse = Snippet.random()


def do_diff(text, n_text):
    """Compare ``text`` with ``n_text`` and return marked-up differences.

    Args:
        text: The original sequence.
        n_text: The new sequence to compare against ``text``.

    Returns:
        A tuple ``(output_orig, output_new, ratio)``. Each output is a list
        whose items are either an unchanged slice or an ``(attribute, slice)``
        pair, where the attribute is ``'missing_text'``, ``'extra_text'`` or
        ``'wrong_text'``. ``ratio`` is ``SequenceMatcher.quick_ratio()``.

    Raises:
        ValueError: If the matcher reports an opcode other than ``equal``,
            ``insert``, ``delete`` or ``replace``.
    """
    seqm = SequenceMatcher(None, text, n_text)
    output_orig = []
    output_new = []
    for opcode, a0, a1, b0, b1 in seqm.get_opcodes():
        orig_seq = seqm.a[a0:a1]
        new_seq = seqm.b[b0:b1]
        if opcode == 'equal':
            output_orig.append(orig_seq)
            output_new.append(orig_seq)
        elif opcode == 'insert':
            output_new.append(('extra_text', new_seq))
        elif opcode == 'delete':
            output_orig.append(('missing_text', orig_seq))
        elif opcode == 'replace':
            output_new.append(('wrong_text', new_seq))
            output_orig.append(('wrong_text', orig_seq))
        else:
            # Raising a plain string is itself a TypeError in Python 3, so
            # raise a real exception that names the offending opcode.
            raise ValueError('Error: unexpected opcode {!r}'.format(opcode))
    return output_orig, output_new, seqm.quick_ratio()
#!/usr/bin/python3
from snippet import Snippet
from utils import *

# Sample text; it is overwritten by the contents of example.html below.
text = """ #!/usr/bin python3 print('hello world!') """

# Load the document and keep only its first 1/32.
with open('example.html') as fp:
    text = fp.read().strip()
text = text[:len(text) // 32]

# Render the text to an image.
snippet = Snippet(text)
png_path = 'hello.png'
screen_width_px = 1920
screen_height_px = 1080
font_size_px = 16
snippet.to_image(png_path, font_size=font_size_px)

# Scale the rendered image to the screen size.
size = [screen_width_px, screen_height_px]
scale_image(png_path, 'scaled.png', size)
def __init__(self, snippet):
    """Initialise the GObject base and wrap ``snippet`` in a ``Snippet``."""
    GObject.Object.__init__(self)

    # Keep the wrapped snippet on a private attribute.
    wrapped = Snippet(snippet)
    self._snippet = wrapped
def split_snippets(s: Snippet, l: int):
    """Split ``s`` by delegating to ``s.split(n, l)``.

    ``n`` is ``s.length // l``, raised to 1 when the quotient is smaller, so
    ``split`` is never asked for fewer than one piece.

    Returns:
        Whatever ``s.split`` returns.
    """
    piece_count = s.length // l
    if piece_count < 1:
        # at least 1
        piece_count = 1
    return s.split(piece_count, l)
def apply_directive(self, directive, bk, code=None, s=None, repl_records=None):
    """Apply one snippet directive and return the updated ``(code, s)`` pair.

    ``directive`` is indexed as ``(snippet id, command[, argument])``. The
    value in ``directive[2]`` depends on the command:
        'push': the argument is the filepath to save into
        'pop': no argument is read
        'add': the argument is the text of the snippet to append

    Args:
        directive: The directive to apply.
        bk: Object holding ``code_stack``, a list of ``(code, filepath)``
            tuples; it is also passed through to ``file_header``.
        code: Code generated so far at this stack level; ``None`` is
            treated as the empty string.
        s: Code of the section currently being generated; ``None`` is
            treated as the empty string.
        repl_records: Optional iterable of ``(snippet id, replacement
            values)`` records; the first record matching the directive's
            snippet id is applied.

    Returns:
        The ``(code, s)`` tuple after the directive has been applied.
    """
    # The defaults are None; normalise them so the string concatenations
    # below cannot fail with a TypeError.
    code = '' if code is None else code
    s = '' if s is None else s
    records = repl_records if repl_records is not None else ()

    snip_id = directive[0]
    cmd = directive[1]
    if cmd == 'push':
        # Perform replacements on the directive argument if necessary
        arg = directive[2]
        for rec in records:
            # rec[0] = snippet id, rec[1] = array of replacement values
            if rec[0] == snip_id:
                arg = replace_values(arg, rec[1])
                break
        # Save the current (code, filepath) on the stack
        bk.code_stack.append((code + s, arg))
        code = s = ''
    elif cmd == 'pop':
        # Finalize code being generated
        code += s
        # Pop up one stack level and resume that level's code generation
        prev_code, filepath = bk.code_stack.pop()
        print(f'{self.id}: apply_directive: writing "{filepath}"')
        # Write out to file
        chrooted = self.id.startswith(('6.', '9.'))
        with open(filepath, 'w') as f:
            f.write(
                file_header(filepath, bk, chrooted=chrooted)
                + code
                + file_footer(filepath))
        # Reset code generation for this stack level
        code = prev_code
        s = ''
    elif cmd == 'add':
        arg = directive[2]
        t = Snippet('userinput', arg)
        # Replace placeholders with actual values
        for rec in records:
            # rec[0] = snippet id, rec[1] = array of replacement values
            if rec[0] == snip_id:
                t.text = t.replace_values(rec[1])
                break
        s += t.generate(f'{self.id}_{snip_id} [add]')
    else:
        print(f'Sect: {self.id}, snippet directive: {snip_id}'
              + f', unknown command "{cmd}"')
    return code, s